diff --git a/README.md b/README.md
new file mode 100644
index 0000000..19387cb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+```bash
+pip install --upgrade pip setuptools wheel
+pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
+pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html
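+# mmdet and mmpose are left unpinned below; mmdet 3.x / mmpose 1.x releases
+# compatible with mmcv 2.1 are assumed.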
+pip install mmdet
+pip install mmpose
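+
+# Example run (defaults taken from main.py):
+# python main.py --video_path ./video/Test3-1.mp4 --output_dir output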
+```
\ No newline at end of file
diff --git a/main.py b/main.py
index 25240f6..f49a13f 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,15 @@
from modules.EARSForDL.model import RegressionResNet
from dotenv import load_dotenv
+# New imports for RTMPose
+from mmpose.apis import inference_topdown
+from mmpose.apis import init_model as init_pose_estimator
+from mmpose.evaluation.functional import nms
+from mmpose.registry import VISUALIZERS
+from mmpose.structures import merge_data_samples, split_instances
+from mmpose.utils import adapt_mmdet_pipeline
+from mmdet.apis import inference_detector, init_detector
+
# Load environment variables
load_dotenv()
@@ -27,6 +36,8 @@
CONV_ENABLED = os.getenv("CONV_ENABLED", "True").lower() == "true"
XGBOOST_ENABLED = os.getenv("XGBOOST_ENABLED", "True").lower() == "true"
LIGHTGBM_ENABLED = os.getenv("LIGHTGBM_ENABLED", "True").lower() == "true"
+POSENET_ENABLED = os.getenv("POSENET_ENABLED", "True").lower() == "true"
+RTMPOSE_ENABLED = os.getenv("RTMPOSE_ENABLED", "False").lower() == "true"
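+# When both flags are set, PoseNet takes precedence (it is checked first below).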
# Get normalization setting
NORMALIZE_ENABLED = os.getenv("NORMALIZE_ENABLED", "False").lower() == "true"
@@ -89,10 +100,35 @@
return transform(Image.open(image_path).convert("RGB")).unsqueeze(0)
-def process_images(base_dir, output_dir):
+def extract_keypoints_rtmpose(pose_results):
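+    """Return the (K, 2) keypoint array of the detected person whose
+    keypoints are, on average, the most visible; None if nothing valid."""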
+ if not pose_results:
+ print("No pose results found.")
+ return None
+
+ max_avg_visible = 0
+ best_instance = None
+ for result in pose_results:
+ pred_instances = result.pred_instances
+ for instance in pred_instances:
+ avg_visible = np.mean(instance.keypoints_visible)
+ if avg_visible > max_avg_visible:
+ max_avg_visible = avg_visible
+ best_instance = instance
+
+ if best_instance is None:
+ print("No valid instances found.")
+ return None
+
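+    # Slicing a single instance out of pred_instances keeps a leading axis of
+    # length 1, so [0] yields the (num_keypoints, 2) array.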
+ keypoints = best_instance.keypoints[0]
+ return keypoints
+
+
+def process_images(args, detector, pose_estimator, visualizer):
+ print("Starting process_images function...")
ears_ai = EarsAI()
calc_position = CalcStethoscopePosition()
- results_dir = os.path.join(output_dir, "results")
+ base_dir = os.path.join(args.output_dir, "frames")
+ results_dir = os.path.join(os.path.dirname(args.output_dir), "results")
csv_path = os.path.join(results_dir, "results.csv")
normalized_csv_path = os.path.join(results_dir, "results-convert.csv")
pose_overlay_dir = os.path.join(results_dir, "pose_overlay_image")
@@ -106,17 +142,59 @@
[f for f in os.listdir(base_dir) if f.lower().endswith(".png")],
key=lambda x: int(re.search(r"(\d+)", x).group(1)),
)
+ print(f"Found {len(png_files)} PNG files.")
rows = []
normalized_rows = []
for image_file_name in png_files:
+ print(f"Processing image: {image_file_name}")
image_path = os.path.join(base_dir, image_file_name)
frame = cv2.imread(image_path)
if frame is None:
print(f"Failed to load image: {image_path}")
continue
- pose_overlay_img, *landmarks = ears_ai.pose_detect(frame, None)
+ if POSENET_ENABLED:
+ pose_overlay_img, *landmarks = ears_ai.pose_detect(frame, None)
+ left_shoulder = landmarks[0]
+ right_shoulder = landmarks[1]
+ left_hip = landmarks[2]
+ right_hip = landmarks[3]
+
+ elif RTMPOSE_ENABLED:
+ det_result = inference_detector(detector, frame)
+ pred_instance = det_result.pred_instances.cpu().numpy()
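+            # Keep person detections (COCO class 0) scoring above 0.3, then
+            # drop overlapping boxes with NMS at an IoU threshold of 0.3.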
+ bboxes = np.concatenate((pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
+ bboxes = bboxes[np.logical_and(pred_instance.labels == 0, pred_instance.scores > 0.3)]
+ bboxes = bboxes[nms(bboxes, 0.3), :4]
+ pose_results = inference_topdown(pose_estimator, frame, bboxes)
+ data_samples = merge_data_samples(pose_results)
+ keypoints = extract_keypoints_rtmpose(pose_results)
+ if keypoints is None:
+ print(f"Failed to extract keypoints for image: {image_path}")
+ continue
+            # COCO indices: 5/6 = left/right shoulder, 11/12 = left/right hip.
+            # mmpose keypoints are (x, y); the CSV rows below index [1] for x
+            # and [0] for y (the PoseNet convention), so flip each pair.
+            left_shoulder = keypoints[5][::-1]
+            right_shoulder = keypoints[6][::-1]
+            left_hip = keypoints[11][::-1]
+            right_hip = keypoints[12][::-1]
+            if visualizer is not None:
+                # The visualizer expects RGB input, but cv2.imread returns BGR;
+                # converting here keeps the RGB2BGR step at write time correct.
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                visualizer.add_datasample(
+                    "result",
+                    frame_rgb,
+                    data_sample=data_samples,
+ draw_gt=False,
+ draw_heatmap=False,
+ draw_bbox=False,
+ show_kpt_idx=False,
+ skeleton_style="mmpose",
+ show=False,
+ wait_time=0,
+ kpt_thr=0.3,
+ )
+ pose_overlay_img = visualizer.get_image()
+ else:
+ print("No pose estimation method enabled. Please enable either PoseNet or RTMPose.")
+ continue
stethoscope_overlay_img, stethoscope_x, stethoscope_y = ears_ai.ssd_detect(frame, None)
cv2.imwrite(os.path.join(pose_overlay_dir, image_file_name), cv2.cvtColor(pose_overlay_img, cv2.COLOR_RGB2BGR))
@@ -127,14 +205,14 @@
row = {
"image_file_name": image_file_name,
- "left_shoulder_x": landmarks[0][1],
- "left_shoulder_y": landmarks[0][0],
- "right_shoulder_x": landmarks[1][1],
- "right_shoulder_y": landmarks[1][0],
- "left_hip_x": landmarks[2][1],
- "left_hip_y": landmarks[2][0],
- "right_hip_x": landmarks[3][1],
- "right_hip_y": landmarks[3][0],
+ "left_shoulder_x": left_shoulder[1],
+ "left_shoulder_y": left_shoulder[0],
+ "right_shoulder_x": right_shoulder[1],
+ "right_shoulder_y": right_shoulder[0],
+ "left_hip_x": left_hip[1],
+ "left_hip_y": left_hip[0],
+ "right_hip_x": right_hip[1],
+ "right_hip_y": right_hip[0],
"stethoscope_x": stethoscope_x,
"stethoscope_y": stethoscope_y,
}
@@ -166,6 +244,7 @@
normalized_rows.append(normalized_row)
if rows:
+ print(f"Writing {len(rows)} rows to CSV...")
fieldnames = list(rows[0].keys())
if CONV_ENABLED:
fieldnames.extend(["conv_stethoscope_x", "conv_stethoscope_y"])
@@ -247,16 +326,14 @@
print(f"Processed and saved results to: {csv_path}")
print(f"Processed and saved normalized results to: {normalized_csv_path}")
- generate_visualizations(csv_path, base_dir, output_dir)
+ generate_visualizations(csv_path, base_dir, results_dir)
else:
print("No data to write to CSV.")
-def generate_visualizations(csv_path, original_images_dir, output_dir):
+def generate_visualizations(csv_path, original_images_dir, results_dir):
df = pd.read_csv(csv_path)
body_image = cv2.imread("./images/body/BodyF.png")
- results_dir = os.path.join(output_dir, "results")
- os.makedirs(results_dir, exist_ok=True)
dirs = {"marked": "marked_images"}
if CONV_ENABLED:
@@ -266,7 +343,6 @@
if LIGHTGBM_ENABLED:
dirs["lightGBM"] = "lightGBM"
- # 必要なディレクトリを作成
os.makedirs(os.path.join(results_dir, "marked_images"), exist_ok=True)
for key in dirs:
if key != "marked":
@@ -282,7 +358,6 @@
print(f"Failed to load image: {os.path.join(original_images_dir, row['image_file_name'])}")
continue
- # Draw markers on original image
for point in ["left_shoulder", "right_shoulder", "left_hip", "right_hip", "stethoscope"]:
cv2.circle(original_image, (int(row[f"{point}_x"]), int(row[f"{point}_y"])), 10, (255, 255, 0), -1)
@@ -292,7 +367,6 @@
x, y = int(row[f"{key}_stethoscope_x"]), int(row[f"{key}_stethoscope_y"])
points[key].append((x, y))
- # Create image with trajectory
image_with_trajectory = body_image.copy()
if len(points[key]) > 1:
cv2.polylines(image_with_trajectory, [np.array(points[key])], False, colors[key], 2)
@@ -301,7 +375,6 @@
os.path.join(results_dir, f"{dirs[key]}_with_trajectory", row["image_file_name"]), image_with_trajectory
)
- # Create image without trajectory
image_without_trajectory = body_image.copy()
cv2.circle(image_without_trajectory, (x, y), 10, colors[key], -1)
cv2.imwrite(
@@ -309,7 +382,6 @@
image_without_trajectory,
)
- # Generate videos
create_video_from_images(os.path.join(results_dir, "marked_images"), os.path.join(results_dir, "marked_video.mp4"))
for key in dirs:
@@ -347,16 +419,36 @@
print(f"Created video: {output_path}")
-if __name__ == "__main__":
+def main():
parser = argparse.ArgumentParser(description="Process video and generate results.")
parser.add_argument("--video_path", default="./video/Test3-1.mp4", help="Path to the input video file")
parser.add_argument("--output_dir", default="output", help="Directory to save output images and results")
+ det_config = "modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py"
+ det_checkpoint = (
+ "https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth"
+ )
+ pose_config = "modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py"
+ pose_checkpoint = "https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth"
args = parser.parse_args()
- # Create output directory structure
os.makedirs(args.output_dir, exist_ok=True)
frames_dir = os.path.join(args.output_dir, "frames")
video_to_frames(args.video_path, frames_dir)
- process_images(frames_dir, args.output_dir)
+
+ if RTMPOSE_ENABLED:
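+        # Assumes a CUDA-capable GPU; change device to "cpu" to run without one.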
+ detector = init_detector(det_config, det_checkpoint, device="cuda:0")
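+        # Namespace the detector's test pipeline so mmdet transforms resolve
+        # correctly when driven from mmpose.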
+ detector.cfg = adapt_mmdet_pipeline(detector.cfg)
+ pose_estimator = init_pose_estimator(pose_config, pose_checkpoint, device="cuda:0")
+ visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer)
+ visualizer.set_dataset_meta(pose_estimator.dataset_meta, skeleton_style="mmpose")
+
+ process_images(args, detector, pose_estimator, visualizer)
+ else:
+ process_images(args, None, None, None)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/modules/rtmpose/configs/_base_/datasets/300vw.py b/modules/rtmpose/configs/_base_/datasets/300vw.py
new file mode 100644
index 0000000..5d5ff02
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/300vw.py
@@ -0,0 +1,134 @@
+dataset_info = dict(
+ dataset_name='300vw',
+ paper_info=dict(
+ author='Jie Shen, Stefanos Zafeiriou, Grigorios G. Chrysos, '
+ 'Jean Kossaifi, Georgios Tzimiropoulos, Maja Pantic',
+ title='The First Facial Landmark Tracking in-the-Wild Challenge: '
+ 'Benchmark and Results',
+ container='Proceedings of the IEEE '
+ 'international conference on computer vision workshops',
+        year='2015',
+ homepage='https://ibug.doc.ic.ac.uk/resources/300-VW/',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-16'),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-15'),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-14'),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-13'),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-12'),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-11'),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-10'),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-9'),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-7'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-6'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-5'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-4'),
+ 13:
+ dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-3'),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-2'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-1'),
+ 16:
+ dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-0'),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-26'),
+ 18:
+ dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-25'),
+ 19:
+ dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-24'),
+ 20:
+ dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-23'),
+ 21:
+ dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-22'),
+ 22:
+ dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-21'),
+ 23:
+ dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-20'),
+ 24:
+ dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-19'),
+ 25:
+ dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-18'),
+ 26:
+ dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-17'),
+ 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''),
+ 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''),
+ 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''),
+ 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''),
+ 31:
+ dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-35'),
+ 32:
+ dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-34'),
+ 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''),
+ 34:
+ dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-32'),
+ 35:
+ dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-31'),
+ 36:
+ dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-45'),
+ 37:
+ dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-44'),
+ 38:
+ dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-43'),
+ 39:
+ dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-42'),
+ 40:
+ dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-47'),
+ 41: dict(
+ name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-46'),
+ 42: dict(
+ name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-39'),
+ 43: dict(
+ name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-38'),
+ 44: dict(
+ name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-37'),
+ 45: dict(
+ name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-36'),
+ 46: dict(
+ name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-41'),
+ 47: dict(
+ name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-40'),
+ 48: dict(
+ name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-54'),
+ 49: dict(
+ name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-53'),
+ 50: dict(
+ name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-52'),
+ 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52: dict(
+ name='kpt-52', id=52, color=[255, 0, 0], type='', swap='kpt-50'),
+ 53: dict(
+ name='kpt-53', id=53, color=[255, 0, 0], type='', swap='kpt-49'),
+ 54: dict(
+ name='kpt-54', id=54, color=[255, 0, 0], type='', swap='kpt-48'),
+ 55: dict(
+ name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'),
+ 56: dict(
+ name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'),
+ 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''),
+ 58: dict(
+ name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'),
+ 59: dict(
+ name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'),
+ 60: dict(
+ name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-64'),
+ 61: dict(
+ name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-63'),
+ 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''),
+ 63: dict(
+ name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-61'),
+ 64: dict(
+ name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-60'),
+ 65: dict(
+ name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-67'),
+ 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''),
+ 67: dict(
+ name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-65'),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 68,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/300w.py b/modules/rtmpose/configs/_base_/datasets/300w.py
new file mode 100644
index 0000000..2920023
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/300w.py
@@ -0,0 +1,134 @@
+dataset_info = dict(
+ dataset_name='300w',
+ paper_info=dict(
+ author='Sagonas, Christos and Antonakos, Epameinondas '
+ 'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos '
+ 'and Pantic, Maja',
+ title='300 faces in-the-wild challenge: '
+ 'Database and results',
+ container='Image and vision computing',
+ year='2016',
+ homepage='https://ibug.doc.ic.ac.uk/resources/300-W/',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-16'),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-15'),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-14'),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-13'),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-12'),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-11'),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-10'),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-9'),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-7'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-6'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-5'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-4'),
+ 13:
+ dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-3'),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-2'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-1'),
+ 16:
+ dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-0'),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-26'),
+ 18:
+ dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-25'),
+ 19:
+ dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-24'),
+ 20:
+ dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-23'),
+ 21:
+ dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-22'),
+ 22:
+ dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-21'),
+ 23:
+ dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-20'),
+ 24:
+ dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-19'),
+ 25:
+ dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-18'),
+ 26:
+ dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-17'),
+ 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''),
+ 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''),
+ 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''),
+ 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''),
+ 31:
+ dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-35'),
+ 32:
+ dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-34'),
+ 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''),
+ 34:
+ dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-32'),
+ 35:
+ dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-31'),
+ 36:
+ dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-45'),
+ 37:
+ dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-44'),
+ 38:
+ dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-43'),
+ 39:
+ dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-42'),
+ 40:
+ dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-47'),
+ 41: dict(
+ name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-46'),
+ 42: dict(
+ name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-39'),
+ 43: dict(
+ name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-38'),
+ 44: dict(
+ name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-37'),
+ 45: dict(
+ name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-36'),
+ 46: dict(
+ name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-41'),
+ 47: dict(
+ name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-40'),
+ 48: dict(
+ name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-54'),
+ 49: dict(
+ name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-53'),
+ 50: dict(
+ name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-52'),
+ 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52: dict(
+ name='kpt-52', id=52, color=[255, 0, 0], type='', swap='kpt-50'),
+ 53: dict(
+ name='kpt-53', id=53, color=[255, 0, 0], type='', swap='kpt-49'),
+ 54: dict(
+ name='kpt-54', id=54, color=[255, 0, 0], type='', swap='kpt-48'),
+ 55: dict(
+ name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'),
+ 56: dict(
+ name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'),
+ 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''),
+ 58: dict(
+ name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'),
+ 59: dict(
+ name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'),
+ 60: dict(
+ name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-64'),
+ 61: dict(
+ name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-63'),
+ 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''),
+ 63: dict(
+ name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-61'),
+ 64: dict(
+ name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-60'),
+ 65: dict(
+ name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-67'),
+ 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''),
+ 67: dict(
+ name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-65'),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 68,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/300wlp.py b/modules/rtmpose/configs/_base_/datasets/300wlp.py
new file mode 100644
index 0000000..b7e2648
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/300wlp.py
@@ -0,0 +1,86 @@
+dataset_info = dict(
+ dataset_name='300wlp',
+ paper_info=dict(
+        author='Zhu, Xiangyu and Lei, Zhen '
+        'and Liu, Xiaoming and Shi, Hailin '
+        'and Li, Stan Z.',
+        title='Face Alignment Across Large Poses: A 3D Solution',
+        container='Proceedings of the IEEE conference on '
+        'computer vision and pattern recognition',
+        year='2016',
+ homepage='http://www.cbsr.ia.ac.cn/users/xiangyuzhu/'
+ 'projects/3DDFA/main.htm',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap=''),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap=''),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap=''),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap=''),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap=''),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap=''),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap=''),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap=''),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap=''),
+ 10: dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap=''),
+ 11: dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap=''),
+ 12: dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap=''),
+ 13: dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap=''),
+ 14: dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap=''),
+ 15: dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap=''),
+ 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''),
+ 17: dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap=''),
+ 18: dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap=''),
+ 19: dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap=''),
+ 20: dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap=''),
+ 21: dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap=''),
+ 22: dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap=''),
+ 23: dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap=''),
+ 24: dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap=''),
+ 25: dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap=''),
+ 26: dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap=''),
+ 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''),
+ 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''),
+ 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''),
+ 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''),
+ 31: dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap=''),
+ 32: dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap=''),
+ 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''),
+ 34: dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap=''),
+ 35: dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap=''),
+ 36: dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap=''),
+ 37: dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap=''),
+ 38: dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap=''),
+ 39: dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap=''),
+ 40: dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap=''),
+ 41: dict(name='kpt-41', id=41, color=[255, 0, 0], type='', swap=''),
+ 42: dict(name='kpt-42', id=42, color=[255, 0, 0], type='', swap=''),
+ 43: dict(name='kpt-43', id=43, color=[255, 0, 0], type='', swap=''),
+ 44: dict(name='kpt-44', id=44, color=[255, 0, 0], type='', swap=''),
+ 45: dict(name='kpt-45', id=45, color=[255, 0, 0], type='', swap=''),
+ 46: dict(name='kpt-46', id=46, color=[255, 0, 0], type='', swap=''),
+ 47: dict(name='kpt-47', id=47, color=[255, 0, 0], type='', swap=''),
+ 48: dict(name='kpt-48', id=48, color=[255, 0, 0], type='', swap=''),
+ 49: dict(name='kpt-49', id=49, color=[255, 0, 0], type='', swap=''),
+ 50: dict(name='kpt-50', id=50, color=[255, 0, 0], type='', swap=''),
+ 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52: dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''),
+ 53: dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''),
+ 54: dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''),
+ 55: dict(name='kpt-55', id=55, color=[255, 0, 0], type='', swap=''),
+ 56: dict(name='kpt-56', id=56, color=[255, 0, 0], type='', swap=''),
+ 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''),
+ 58: dict(name='kpt-58', id=58, color=[255, 0, 0], type='', swap=''),
+ 59: dict(name='kpt-59', id=59, color=[255, 0, 0], type='', swap=''),
+ 60: dict(name='kpt-60', id=60, color=[255, 0, 0], type='', swap=''),
+ 61: dict(name='kpt-61', id=61, color=[255, 0, 0], type='', swap=''),
+ 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''),
+ 63: dict(name='kpt-63', id=63, color=[255, 0, 0], type='', swap=''),
+ 64: dict(name='kpt-64', id=64, color=[255, 0, 0], type='', swap=''),
+ 65: dict(name='kpt-65', id=65, color=[255, 0, 0], type='', swap=''),
+ 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''),
+ 67: dict(name='kpt-67', id=67, color=[255, 0, 0], type='', swap=''),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 68,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/aflw.py b/modules/rtmpose/configs/_base_/datasets/aflw.py
new file mode 100644
index 0000000..e092de6
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/aflw.py
@@ -0,0 +1,44 @@
+dataset_info = dict(
+ dataset_name='aflw',
+ paper_info=dict(
+ author='Koestinger, Martin and Wohlhart, Paul and '
+ 'Roth, Peter M and Bischof, Horst',
+ title='Annotated facial landmarks in the wild: '
+ 'A large-scale, real-world database for facial '
+ 'landmark localization',
+ container='2011 IEEE international conference on computer '
+ 'vision workshops (ICCV workshops)',
+ year='2011',
+ homepage='https://www.tugraz.at/institute/icg/research/'
+ 'team-bischof/lrs/downloads/aflw/',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-5'),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-4'),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-1'),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-0'),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-11'),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-10'),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-7'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-6'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'),
+ 13: dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap=''),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'),
+ 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'),
+ 18: dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='')
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 19,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/aic.py b/modules/rtmpose/configs/_base_/datasets/aic.py
new file mode 100644
index 0000000..8d30f60
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/aic.py
@@ -0,0 +1,140 @@
+dataset_info = dict(
+ dataset_name='aic',
+ paper_info=dict(
+ author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
+ 'Li, Yixin and Yan, Baoming and Liang, Rui and '
+ 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
+ 'Fu, Yanwei and others',
+ title='Ai challenger: A large-scale dataset for going '
+ 'deeper in image understanding',
+ container='arXiv',
+ year='2017',
+ homepage='https://github.com/AIChallenger/AI_Challenger_2017',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_shoulder',
+ id=0,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 1:
+ dict(
+ name='right_elbow',
+ id=1,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 2:
+ dict(
+ name='right_wrist',
+ id=2,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 3:
+ dict(
+ name='left_shoulder',
+ id=3,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 4:
+ dict(
+ name='left_elbow',
+ id=4,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 5:
+ dict(
+ name='left_wrist',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 6:
+ dict(
+ name='right_hip',
+ id=6,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 7:
+ dict(
+ name='right_knee',
+ id=7,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 8:
+ dict(
+ name='right_ankle',
+ id=8,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 9:
+ dict(
+ name='left_hip',
+ id=9,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 10:
+ dict(
+ name='left_knee',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 11:
+ dict(
+ name='left_ankle',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 12:
+ dict(
+ name='head_top',
+ id=12,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 13:
+ dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]),
+ 1: dict(
+ link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]),
+ 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
+ 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
+ 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
+ 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
+ 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]),
+ 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
+ 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
+ 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
+ 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
+ 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
+ 12: dict(
+ link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
+ ],
+
+ # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/'
+ # 'Evaluation/keypoint_eval/keypoint_eval.py#L50'
+ # delta = 2 x sigma
+ sigmas=[
+ 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144,
+ 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081,
+ 0.01291456, 0.01236173
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/ak.py b/modules/rtmpose/configs/_base_/datasets/ak.py
new file mode 100644
index 0000000..ddc9e27
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/ak.py
@@ -0,0 +1,267 @@
+dataset_info = dict(
+ dataset_name='Animal Kingdom',
+ paper_info=dict(
+        author='Ng, Xun Long and Ong, Kian Eng and Zheng, Qichen '
+        'and Ni, Yun and Yeo, Si Yong and Liu, Jun '
+        '(Singapore University of Technology and Design, Singapore)',
+ title='Animal Kingdom: '
+ 'A Large and Diverse Dataset for Animal Behavior Understanding',
+ container='Conference on Computer Vision '
+ 'and Pattern Recognition (CVPR)',
+ year='2022',
+ homepage='https://sutdcv.github.io/Animal-Kingdom',
+ version='1.0 (2022-06)',
+ date_created='2022-06',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='Head_Mid_Top',
+ id=0,
+ color=(225, 0, 255),
+ type='upper',
+ swap=''),
+ 1:
+ dict(
+ name='Eye_Left',
+ id=1,
+ color=[220, 20, 60],
+ type='upper',
+ swap='Eye_Right'),
+ 2:
+ dict(
+ name='Eye_Right',
+ id=2,
+ color=[0, 255, 255],
+ type='upper',
+ swap='Eye_Left'),
+ 3:
+ dict(
+ name='Mouth_Front_Top',
+ id=3,
+ color=(0, 255, 42),
+ type='upper',
+ swap=''),
+ 4:
+ dict(
+ name='Mouth_Back_Left',
+ id=4,
+ color=[221, 160, 221],
+ type='upper',
+ swap='Mouth_Back_Right'),
+ 5:
+ dict(
+ name='Mouth_Back_Right',
+ id=5,
+ color=[135, 206, 250],
+ type='upper',
+ swap='Mouth_Back_Left'),
+ 6:
+ dict(
+ name='Mouth_Front_Bottom',
+ id=6,
+ color=[50, 205, 50],
+ type='upper',
+ swap=''),
+ 7:
+ dict(
+ name='Shoulder_Left',
+ id=7,
+ color=[255, 182, 193],
+ type='upper',
+ swap='Shoulder_Right'),
+ 8:
+ dict(
+ name='Shoulder_Right',
+ id=8,
+ color=[0, 191, 255],
+ type='upper',
+ swap='Shoulder_Left'),
+ 9:
+ dict(
+ name='Elbow_Left',
+ id=9,
+ color=[255, 105, 180],
+ type='upper',
+ swap='Elbow_Right'),
+ 10:
+ dict(
+ name='Elbow_Right',
+ id=10,
+ color=[30, 144, 255],
+ type='upper',
+ swap='Elbow_Left'),
+ 11:
+ dict(
+ name='Wrist_Left',
+ id=11,
+ color=[255, 20, 147],
+ type='upper',
+ swap='Wrist_Right'),
+ 12:
+ dict(
+ name='Wrist_Right',
+ id=12,
+ color=[0, 0, 255],
+ type='upper',
+ swap='Wrist_Left'),
+ 13:
+ dict(
+ name='Torso_Mid_Back',
+ id=13,
+ color=(185, 3, 221),
+ type='upper',
+ swap=''),
+ 14:
+ dict(
+ name='Hip_Left',
+ id=14,
+ color=[255, 215, 0],
+ type='lower',
+ swap='Hip_Right'),
+ 15:
+ dict(
+ name='Hip_Right',
+ id=15,
+ color=[147, 112, 219],
+ type='lower',
+ swap='Hip_Left'),
+ 16:
+ dict(
+ name='Knee_Left',
+ id=16,
+ color=[255, 165, 0],
+ type='lower',
+ swap='Knee_Right'),
+ 17:
+ dict(
+ name='Knee_Right',
+ id=17,
+ color=[138, 43, 226],
+ type='lower',
+ swap='Knee_Left'),
+ 18:
+ dict(
+ name='Ankle_Left',
+ id=18,
+ color=[255, 140, 0],
+ type='lower',
+ swap='Ankle_Right'),
+ 19:
+ dict(
+ name='Ankle_Right',
+ id=19,
+ color=[128, 0, 128],
+ type='lower',
+ swap='Ankle_Left'),
+ 20:
+ dict(
+ name='Tail_Top_Back',
+ id=20,
+ color=(0, 251, 255),
+ type='lower',
+ swap=''),
+ 21:
+ dict(
+ name='Tail_Mid_Back',
+ id=21,
+ color=[32, 178, 170],
+ type='lower',
+ swap=''),
+ 22:
+ dict(
+ name='Tail_End_Back',
+ id=22,
+ color=(0, 102, 102),
+ type='lower',
+ swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('Eye_Left', 'Head_Mid_Top'), id=0, color=[220, 20, 60]),
+ 1:
+ dict(link=('Eye_Right', 'Head_Mid_Top'), id=1, color=[0, 255, 255]),
+ 2:
+ dict(
+ link=('Mouth_Front_Top', 'Mouth_Back_Left'),
+ id=2,
+ color=[221, 160, 221]),
+ 3:
+ dict(
+ link=('Mouth_Front_Top', 'Mouth_Back_Right'),
+ id=3,
+ color=[135, 206, 250]),
+ 4:
+ dict(
+ link=('Mouth_Front_Bottom', 'Mouth_Back_Left'),
+ id=4,
+ color=[221, 160, 221]),
+ 5:
+ dict(
+ link=('Mouth_Front_Bottom', 'Mouth_Back_Right'),
+ id=5,
+ color=[135, 206, 250]),
+ 6:
+ dict(
+ link=('Head_Mid_Top', 'Torso_Mid_Back'), id=6,
+ color=(225, 0, 255)),
+ 7:
+ dict(
+ link=('Torso_Mid_Back', 'Tail_Top_Back'),
+ id=7,
+ color=(185, 3, 221)),
+ 8:
+ dict(
+ link=('Tail_Top_Back', 'Tail_Mid_Back'), id=8,
+ color=(0, 251, 255)),
+ 9:
+ dict(
+ link=('Tail_Mid_Back', 'Tail_End_Back'),
+ id=9,
+ color=[32, 178, 170]),
+ 10:
+ dict(
+ link=('Head_Mid_Top', 'Shoulder_Left'),
+ id=10,
+ color=[255, 182, 193]),
+ 11:
+ dict(
+ link=('Head_Mid_Top', 'Shoulder_Right'),
+ id=11,
+ color=[0, 191, 255]),
+ 12:
+ dict(
+ link=('Shoulder_Left', 'Elbow_Left'), id=12, color=[255, 105,
+ 180]),
+ 13:
+ dict(
+ link=('Shoulder_Right', 'Elbow_Right'),
+ id=13,
+ color=[30, 144, 255]),
+ 14:
+ dict(link=('Elbow_Left', 'Wrist_Left'), id=14, color=[255, 20, 147]),
+ 15:
+ dict(link=('Elbow_Right', 'Wrist_Right'), id=15, color=[0, 0, 255]),
+ 16:
+ dict(link=('Tail_Top_Back', 'Hip_Left'), id=16, color=[255, 215, 0]),
+ 17:
+ dict(
+ link=('Tail_Top_Back', 'Hip_Right'), id=17, color=[147, 112, 219]),
+ 18:
+ dict(link=('Hip_Left', 'Knee_Left'), id=18, color=[255, 165, 0]),
+ 19:
+ dict(link=('Hip_Right', 'Knee_Right'), id=19, color=[138, 43, 226]),
+ 20:
+ dict(link=('Knee_Left', 'Ankle_Left'), id=20, color=[255, 140, 0]),
+ 21:
+ dict(link=('Knee_Right', 'Ankle_Right'), id=21, color=[128, 0, 128])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+ 1., 1., 1., 1., 1.
+ ],
+ sigmas=[
+ 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025,
+ 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025,
+ 0.025, 0.025, 0.025
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/animalpose.py b/modules/rtmpose/configs/_base_/datasets/animalpose.py
new file mode 100644
index 0000000..7f614f7
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/animalpose.py
@@ -0,0 +1,166 @@
+dataset_info = dict(
+ dataset_name='animalpose',
+ paper_info=dict(
+ author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and '
+ 'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing',
+ title='Cross-Domain Adaptation for Animal Pose Estimation',
+ container='The IEEE International Conference on '
+ 'Computer Vision (ICCV)',
+ year='2019',
+ homepage='https://sites.google.com/view/animal-pose/',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
+ 1:
+ dict(
+ name='R_Eye',
+ id=1,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_Eye'),
+ 2:
+ dict(
+ name='L_EarBase',
+ id=2,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_EarBase'),
+ 3:
+ dict(
+ name='R_EarBase',
+ id=3,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_EarBase'),
+ 4:
+ dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''),
+ 5:
+ dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''),
+ 6:
+ dict(
+ name='TailBase', id=6, color=[51, 153, 255], type='lower',
+ swap=''),
+ 7:
+ dict(
+ name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''),
+ 8:
+ dict(
+ name='L_F_Elbow',
+ id=8,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_F_Elbow'),
+ 9:
+ dict(
+ name='R_F_Elbow',
+ id=9,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_F_Elbow'),
+ 10:
+ dict(
+ name='L_B_Elbow',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_B_Elbow'),
+ 11:
+ dict(
+ name='R_B_Elbow',
+ id=11,
+ color=[255, 128, 0],
+ type='lower',
+ swap='L_B_Elbow'),
+ 12:
+ dict(
+ name='L_F_Knee',
+ id=12,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_F_Knee'),
+ 13:
+ dict(
+ name='R_F_Knee',
+ id=13,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_F_Knee'),
+ 14:
+ dict(
+ name='L_B_Knee',
+ id=14,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_B_Knee'),
+ 15:
+ dict(
+ name='R_B_Knee',
+ id=15,
+ color=[255, 128, 0],
+ type='lower',
+ swap='L_B_Knee'),
+ 16:
+ dict(
+ name='L_F_Paw',
+ id=16,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_F_Paw'),
+ 17:
+ dict(
+ name='R_F_Paw',
+ id=17,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_F_Paw'),
+ 18:
+ dict(
+ name='L_B_Paw',
+ id=18,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_B_Paw'),
+ 19:
+ dict(
+ name='R_B_Paw',
+ id=19,
+ color=[255, 128, 0],
+ type='lower',
+ swap='L_B_Paw')
+ },
+ skeleton_info={
+ 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]),
+ 1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]),
+ 2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]),
+ 3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]),
+ 5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]),
+ 6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]),
+ 7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]),
+ 8: dict(link=('Throat', 'L_F_Elbow'), id=8, color=[0, 255, 0]),
+ 9: dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]),
+ 10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]),
+ 11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]),
+ 12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]),
+ 13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]),
+ 14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]),
+ 15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]),
+ 16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]),
+ 17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 128, 0]),
+ 18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]),
+ 19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2,
+ 1.5, 1.5, 1.5, 1.5
+ ],
+
+ # Note: The original paper did not provide enough information about
+ # the sigmas. We modified from 'https://github.com/cocodataset/'
+ # 'cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523'
+ sigmas=[
+ 0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107,
+ 0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/ap10k.py b/modules/rtmpose/configs/_base_/datasets/ap10k.py
new file mode 100644
index 0000000..aecc173
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/ap10k.py
@@ -0,0 +1,142 @@
+dataset_info = dict(
+ dataset_name='ap10k',
+ paper_info=dict(
+ author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
+ 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
+ title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
+ container='35th Conference on Neural Information Processing Systems '
+ '(NeurIPS 2021) Track on Datasets and Bench-marks.',
+ year='2021',
+ homepage='https://github.com/AlexTheBad/AP-10K',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
+ 1:
+ dict(
+ name='R_Eye',
+ id=1,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_Eye'),
+ 2:
+ dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
+ 3:
+ dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
+ 4:
+ dict(
+ name='Root of tail',
+ id=4,
+ color=[51, 153, 255],
+ type='lower',
+ swap=''),
+ 5:
+ dict(
+ name='L_Shoulder',
+ id=5,
+ color=[51, 153, 255],
+ type='upper',
+ swap='R_Shoulder'),
+ 6:
+ dict(
+ name='L_Elbow',
+ id=6,
+ color=[51, 153, 255],
+ type='upper',
+ swap='R_Elbow'),
+ 7:
+ dict(
+ name='L_F_Paw',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_F_Paw'),
+ 8:
+ dict(
+ name='R_Shoulder',
+ id=8,
+ color=[0, 255, 0],
+ type='upper',
+ swap='L_Shoulder'),
+ 9:
+ dict(
+ name='R_Elbow',
+ id=9,
+ color=[255, 128, 0],
+ type='upper',
+ swap='L_Elbow'),
+ 10:
+ dict(
+ name='R_F_Paw',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_F_Paw'),
+ 11:
+ dict(
+ name='L_Hip',
+ id=11,
+ color=[255, 128, 0],
+ type='lower',
+ swap='R_Hip'),
+ 12:
+ dict(
+ name='L_Knee',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='R_Knee'),
+ 13:
+ dict(
+ name='L_B_Paw',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_B_Paw'),
+ 14:
+ dict(
+ name='R_Hip', id=14, color=[0, 255, 0], type='lower',
+ swap='L_Hip'),
+ 15:
+ dict(
+ name='R_Knee',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Knee'),
+ 16:
+ dict(
+ name='R_B_Paw',
+ id=16,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_B_Paw'),
+ },
+ skeleton_info={
+ 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
+ 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
+ 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
+ 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
+ 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
+ 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
+        7: dict(link=('L_Elbow', 'L_F_Paw'), id=7, color=[0, 255, 255]),
+        8: dict(link=('Neck', 'R_Shoulder'), id=8, color=[6, 156, 250]),
+        9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[6, 156, 250]),
+        10: dict(link=('R_Elbow', 'R_F_Paw'), id=10, color=[6, 156, 250]),
+        11: dict(link=('Root of tail', 'L_Hip'), id=11, color=[0, 255, 255]),
+        12: dict(link=('L_Hip', 'L_Knee'), id=12, color=[0, 255, 255]),
+        13: dict(link=('L_Knee', 'L_B_Paw'), id=13, color=[0, 255, 255]),
+        14: dict(link=('Root of tail', 'R_Hip'), id=14, color=[6, 156, 250]),
+        15: dict(link=('R_Hip', 'R_Knee'), id=15, color=[6, 156, 250]),
+        16: dict(link=('R_Knee', 'R_B_Paw'), id=16, color=[6, 156, 250]),
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
+ 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/atrw.py b/modules/rtmpose/configs/_base_/datasets/atrw.py
new file mode 100644
index 0000000..84d3fb3
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/atrw.py
@@ -0,0 +1,144 @@
+dataset_info = dict(
+ dataset_name='atrw',
+ paper_info=dict(
+ author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin '
+ 'and Qian, Rui and Lin, Weiyao',
+ title='ATRW: A Benchmark for Amur Tiger '
+ 'Re-identification in the Wild',
+ container='Proceedings of the 28th ACM '
+ 'International Conference on Multimedia',
+ year='2020',
+ homepage='https://cvwc2019.github.io/challenge.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left_ear',
+ id=0,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 1:
+ dict(
+ name='right_ear',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 2:
+ dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''),
+ 3:
+ dict(
+ name='right_shoulder',
+ id=3,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 4:
+ dict(
+ name='right_front_paw',
+ id=4,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_front_paw'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='left_front_paw',
+ id=6,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_front_paw'),
+ 7:
+ dict(
+ name='right_hip',
+ id=7,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 8:
+ dict(
+ name='right_knee',
+ id=8,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 9:
+ dict(
+ name='right_back_paw',
+ id=9,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_back_paw'),
+ 10:
+ dict(
+ name='left_hip',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 11:
+ dict(
+ name='left_knee',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 12:
+ dict(
+ name='left_back_paw',
+ id=12,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_back_paw'),
+ 13:
+ dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''),
+ 14:
+ dict(
+ name='center', id=14, color=[51, 153, 255], type='lower', swap=''),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]),
+ 1:
+ dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]),
+ 2:
+ dict(link=('nose', 'center'), id=2, color=[51, 153, 255]),
+ 3:
+ dict(
+ link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]),
+ 5:
+ dict(
+ link=('right_shoulder', 'right_front_paw'),
+ id=5,
+ color=[255, 128, 0]),
+ 6:
+ dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]),
+ 7:
+ dict(link=('tail', 'center'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]),
+ 9:
+ dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]),
+ 11:
+ dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]),
+ 12:
+ dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]),
+ },
+ joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ sigmas=[
+ 0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440,
+ 0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/campus.py b/modules/rtmpose/configs/_base_/datasets/campus.py
new file mode 100644
index 0000000..06cc7ec
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/campus.py
@@ -0,0 +1,151 @@
+dataset_info = dict(
+ dataset_name='campus',
+ paper_info=dict(
+ author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, '
+ 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan',
+ title='3D Pictorial Structures for Multiple Human Pose Estimation',
+ container='IEEE Computer Society Conference on Computer Vision and '
+ 'Pattern Recognition (CVPR)',
+ year='2014',
+ homepage='http://campar.in.tum.de/Chair/MultiHumanPose',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_ankle',
+ id=0,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 1:
+ dict(
+ name='right_knee',
+ id=1,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 2:
+ dict(
+ name='right_hip',
+ id=2,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 3:
+ dict(
+ name='left_hip',
+ id=3,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 4:
+ dict(
+ name='left_knee',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 5:
+ dict(
+ name='left_ankle',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 6:
+ dict(
+ name='right_wrist',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 7:
+ dict(
+ name='right_elbow',
+ id=7,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 8:
+ dict(
+ name='right_shoulder',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 9:
+ dict(
+ name='left_shoulder',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 10:
+ dict(
+ name='left_elbow',
+ id=10,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 11:
+ dict(
+ name='left_wrist',
+ id=11,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 12:
+ dict(
+ name='bottom_head',
+ id=12,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 13:
+ dict(
+ name='top_head',
+ id=13,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]),
+ 3:
+ dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(
+ link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]),
+ 7:
+ dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]),
+ 8:
+ dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(
+ link=('right_shoulder', 'bottom_head'), id=11, color=[255, 128,
+ 0]),
+ 12:
+ dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]),
+ },
+ joint_weights=[
+ 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0
+ ],
+ sigmas=[
+ 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079,
+ 0.072, 0.062, 0.026, 0.026
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/coco.py b/modules/rtmpose/configs/_base_/datasets/coco.py
new file mode 100644
index 0000000..787e834
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco.py
@@ -0,0 +1,181 @@
+dataset_info = dict(
+ dataset_name='coco',
+ paper_info=dict(
+ author='Lin, Tsung-Yi and Maire, Michael and '
+ 'Belongie, Serge and Hays, James and '
+ 'Perona, Pietro and Ramanan, Deva and '
+ r'Doll{\'a}r, Piotr and Zitnick, C Lawrence',
+ title='Microsoft coco: Common objects in context',
+ container='European conference on computer vision',
+ year='2014',
+ homepage='http://cocodataset.org/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
+ ])
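The config above is the standard COCO-17 metainfo vendored from MMPose: `keypoint_info` names each joint and its horizontal-flip partner, `skeleton_info` lists the limbs to draw, and `sigmas` are the per-keypoint constants of the COCO Object Keypoint Similarity (OKS) metric. A minimal sketch of how those `sigmas` are consumed (function and argument names are illustrative, not part of the config):

```python
import numpy as np

# Per-keypoint OKS constants, copied from the `sigmas` list above.
SIGMAS = np.array([
    0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
    0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089])

def oks(gt, pred, area, visible):
    """OKS between one ground-truth and one predicted (17, 2) array.

    `area` is the ground-truth object area and `visible` a boolean
    mask over the 17 joints (hypothetical inputs for illustration).
    """
    d2 = np.sum((gt - pred) ** 2, axis=-1)       # squared pixel distances
    k2 = (2.0 * SIGMAS) ** 2                     # per-joint falloff constants
    e = d2 / (2.0 * area * k2 + np.spacing(1))   # normalized error
    return float(np.exp(-e)[visible].mean()) if visible.any() else 0.0
```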
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_aic.py b/modules/rtmpose/configs/_base_/datasets/coco_aic.py
new file mode 100644
index 0000000..edd636b
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_aic.py
@@ -0,0 +1,205 @@
+dataset_info = dict(
+ dataset_name='coco',
+ paper_info=[
+ dict(
+ author='Lin, Tsung-Yi and Maire, Michael and '
+ 'Belongie, Serge and Hays, James and '
+ 'Perona, Pietro and Ramanan, Deva and '
+ r'Doll{\'a}r, Piotr and Zitnick, C Lawrence',
+ title='Microsoft coco: Common objects in context',
+ container='European conference on computer vision',
+ year='2014',
+ homepage='http://cocodataset.org/',
+ ),
+ dict(
+ author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
+ 'Li, Yixin and Yan, Baoming and Liang, Rui and '
+ 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
+ 'Fu, Yanwei and others',
+ title='Ai challenger: A large-scale dataset for going '
+ 'deeper in image understanding',
+ container='arXiv',
+ year='2017',
+ homepage='https://github.com/AIChallenger/AI_Challenger_2017',
+ ),
+ ],
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='head_top',
+ id=17,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 18:
+ dict(name='neck', id=18, color=[51, 153, 255], type='upper', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+        dict(link=('head_top', 'neck'), id=19, color=[51, 153, 255]),
+ },
+ joint_weights=[
+        1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+        1.5, 1.5, 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.026, 0.026
+ ])
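`coco_aic.py` extends COCO-17 with AIC's `head_top` and `neck`, giving 19 keypoints for combined training; the `swap` fields are what drive horizontal-flip augmentation. A sketch of how a flip-index permutation is typically derived from them (the helper name is illustrative):

```python
def flip_permutation(keypoint_info):
    """Build the index permutation implied by the `swap` fields above."""
    name_to_idx = {v['name']: k for k, v in keypoint_info.items()}
    perm = list(range(len(keypoint_info)))
    for idx, info in keypoint_info.items():
        if info['swap']:                      # '' marks self-symmetric joints
            perm[idx] = name_to_idx[info['swap']]
    return perm

# After mirroring x-coordinates: flipped = kpts[flip_permutation(keypoint_info)]
```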
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_openpose.py b/modules/rtmpose/configs/_base_/datasets/coco_openpose.py
new file mode 100644
index 0000000..7bab501
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_openpose.py
@@ -0,0 +1,157 @@
+dataset_info = dict(
+ dataset_name='coco_openpose',
+ paper_info=dict(
+ author='Zhe, Cao and Tomas, Simon and '
+ 'Shih-En, Wei and Yaser, Sheikh',
+ title='OpenPose: Realtime Multi-Person 2D Pose '
+ 'Estimation using Part Affinity Fields',
+ container='IEEE Transactions on Pattern Analysis '
+ 'and Machine Intelligence',
+ year='2019',
+ homepage='https://github.com/CMU-Perceptual-Computing-Lab/openpose/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[255, 0, 0], type='upper', swap=''),
+ 1:
+ dict(name='neck', id=1, color=[255, 85, 0], type='upper', swap=''),
+ 2:
+ dict(
+ name='right_shoulder',
+ id=2,
+ color=[255, 170, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[255, 255, 0],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='right_wrist',
+ id=4,
+ color=[170, 255, 0],
+ type='upper',
+ swap='left_wrist'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[85, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='left_elbow',
+ id=6,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 7:
+ dict(
+ name='left_wrist',
+ id=7,
+ color=[0, 255, 85],
+ type='upper',
+ swap='right_wrist'),
+ 8:
+ dict(
+ name='right_hip',
+ id=8,
+ color=[0, 255, 170],
+ type='lower',
+ swap='left_hip'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[0, 255, 255],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='right_ankle',
+ id=10,
+ color=[0, 170, 255],
+ type='lower',
+ swap='left_ankle'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 85, 255],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='left_knee',
+ id=12,
+ color=[0, 0, 255],
+ type='lower',
+ swap='right_knee'),
+ 13:
+ dict(
+ name='left_ankle',
+ id=13,
+ color=[85, 0, 255],
+ type='lower',
+ swap='right_ankle'),
+ 14:
+ dict(
+ name='right_eye',
+ id=14,
+ color=[170, 0, 255],
+ type='upper',
+ swap='left_eye'),
+ 15:
+ dict(
+ name='left_eye',
+ id=15,
+ color=[255, 0, 255],
+ type='upper',
+ swap='right_eye'),
+ 16:
+ dict(
+ name='right_ear',
+ id=16,
+ color=[255, 0, 170],
+ type='upper',
+ swap='left_ear'),
+ 17:
+ dict(
+ name='left_ear',
+ id=17,
+ color=[255, 0, 85],
+ type='upper',
+ swap='right_ear'),
+ },
+ skeleton_info={
+ 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 0, 0]),
+ 1: dict(link=('neck', 'left_shoulder'), id=1, color=[255, 85, 0]),
+ 2: dict(
+ link=('right_shoulder', 'right_elbow'), id=2, color=[255, 170, 0]),
+ 3:
+ dict(link=('right_elbow', 'right_wrist'), id=3, color=[255, 255, 0]),
+ 4:
+ dict(link=('left_shoulder', 'left_elbow'), id=4, color=[170, 255, 0]),
+ 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[85, 255, 0]),
+ 6: dict(link=('neck', 'right_hip'), id=6, color=[0, 255, 0]),
+ 7: dict(link=('right_hip', 'right_knee'), id=7, color=[0, 255, 85]),
+ 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[0, 255, 170]),
+        9: dict(link=('neck', 'left_hip'), id=9, color=[0, 255, 255]),
+ 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 170, 255]),
+ 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 85, 255]),
+ 12: dict(link=('neck', 'nose'), id=12, color=[0, 0, 255]),
+ 13: dict(link=('nose', 'right_eye'), id=13, color=[255, 0, 170]),
+ 14: dict(link=('right_eye', 'right_ear'), id=14, color=[170, 0, 255]),
+ 15: dict(link=('nose', 'left_eye'), id=15, color=[255, 0, 255]),
+ 16: dict(link=('left_eye', 'left_ear'), id=16, color=[255, 0, 170]),
+ },
+ joint_weights=[1.] * 18,
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.082
+ ])
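`coco_openpose.py` defines the 18-point OpenPose body layout, which orders the neck second. COCO itself has no neck annotation, so pipelines that render COCO predictions in this layout usually synthesize one. A hedged sketch of that conversion, with the index map read off the two keypoint orderings (the shoulder-midpoint neck is a common convention, assumed here rather than specified by the config):

```python
import numpy as np

# COCO-17 source index for each OpenPose-18 slot; -1 marks the neck,
# which COCO does not annotate.
COCO_FOR_OPENPOSE = [0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3]

def coco17_to_openpose18(kpts):
    """Reorder a (17, 2) COCO array into the (18, 2) OpenPose layout."""
    out = np.empty((18, 2), dtype=float)
    for op_idx, coco_idx in enumerate(COCO_FOR_OPENPOSE):
        # Approximate the neck as the midpoint of the two shoulders.
        out[op_idx] = kpts[coco_idx] if coco_idx >= 0 else kpts[[5, 6]].mean(axis=0)
    return out
```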
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py
new file mode 100644
index 0000000..a739c97
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody.py
@@ -0,0 +1,1154 @@
+dataset_info = dict(
+ dataset_name='coco_wholebody',
+ paper_info=dict(
+ author='Jin, Sheng and Xu, Lumin and Xu, Jin and '
+ 'Wang, Can and Liu, Wentao and '
+ 'Qian, Chen and Ouyang, Wanli and Luo, Ping',
+ title='Whole-Body Human Pose Estimation in the Wild',
+ container='Proceedings of the European '
+ 'Conference on Computer Vision (ECCV)',
+ year='2020',
+ homepage='https://github.com/jin-s13/COCO-WholeBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='left_big_toe',
+ id=17,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 18:
+ dict(
+ name='left_small_toe',
+ id=18,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 19:
+ dict(
+ name='left_heel',
+ id=19,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_heel'),
+ 20:
+ dict(
+ name='right_big_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 21:
+ dict(
+ name='right_small_toe',
+ id=21,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 22:
+ dict(
+ name='right_heel',
+ id=22,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_heel'),
+ 23:
+ dict(
+ name='face-0',
+ id=23,
+ color=[255, 255, 255],
+ type='',
+ swap='face-16'),
+ 24:
+ dict(
+ name='face-1',
+ id=24,
+ color=[255, 255, 255],
+ type='',
+ swap='face-15'),
+ 25:
+ dict(
+ name='face-2',
+ id=25,
+ color=[255, 255, 255],
+ type='',
+ swap='face-14'),
+ 26:
+ dict(
+ name='face-3',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='face-13'),
+ 27:
+ dict(
+ name='face-4',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='face-12'),
+ 28:
+ dict(
+ name='face-5',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='face-11'),
+ 29:
+ dict(
+ name='face-6',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='face-10'),
+ 30:
+ dict(
+ name='face-7',
+ id=30,
+ color=[255, 255, 255],
+ type='',
+ swap='face-9'),
+ 31:
+ dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''),
+ 32:
+ dict(
+ name='face-9',
+ id=32,
+ color=[255, 255, 255],
+ type='',
+ swap='face-7'),
+ 33:
+ dict(
+ name='face-10',
+ id=33,
+ color=[255, 255, 255],
+ type='',
+ swap='face-6'),
+ 34:
+ dict(
+ name='face-11',
+ id=34,
+ color=[255, 255, 255],
+ type='',
+ swap='face-5'),
+ 35:
+ dict(
+ name='face-12',
+ id=35,
+ color=[255, 255, 255],
+ type='',
+ swap='face-4'),
+ 36:
+ dict(
+ name='face-13',
+ id=36,
+ color=[255, 255, 255],
+ type='',
+ swap='face-3'),
+ 37:
+ dict(
+ name='face-14',
+ id=37,
+ color=[255, 255, 255],
+ type='',
+ swap='face-2'),
+ 38:
+ dict(
+ name='face-15',
+ id=38,
+ color=[255, 255, 255],
+ type='',
+ swap='face-1'),
+ 39:
+ dict(
+ name='face-16',
+ id=39,
+ color=[255, 255, 255],
+ type='',
+ swap='face-0'),
+ 40:
+ dict(
+ name='face-17',
+ id=40,
+ color=[255, 255, 255],
+ type='',
+ swap='face-26'),
+ 41:
+ dict(
+ name='face-18',
+ id=41,
+ color=[255, 255, 255],
+ type='',
+ swap='face-25'),
+ 42:
+ dict(
+ name='face-19',
+ id=42,
+ color=[255, 255, 255],
+ type='',
+ swap='face-24'),
+ 43:
+ dict(
+ name='face-20',
+ id=43,
+ color=[255, 255, 255],
+ type='',
+ swap='face-23'),
+ 44:
+ dict(
+ name='face-21',
+ id=44,
+ color=[255, 255, 255],
+ type='',
+ swap='face-22'),
+ 45:
+ dict(
+ name='face-22',
+ id=45,
+ color=[255, 255, 255],
+ type='',
+ swap='face-21'),
+ 46:
+ dict(
+ name='face-23',
+ id=46,
+ color=[255, 255, 255],
+ type='',
+ swap='face-20'),
+ 47:
+ dict(
+ name='face-24',
+ id=47,
+ color=[255, 255, 255],
+ type='',
+ swap='face-19'),
+ 48:
+ dict(
+ name='face-25',
+ id=48,
+ color=[255, 255, 255],
+ type='',
+ swap='face-18'),
+ 49:
+ dict(
+ name='face-26',
+ id=49,
+ color=[255, 255, 255],
+ type='',
+ swap='face-17'),
+ 50:
+ dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''),
+ 51:
+ dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''),
+ 52:
+ dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''),
+ 53:
+ dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''),
+ 54:
+ dict(
+ name='face-31',
+ id=54,
+ color=[255, 255, 255],
+ type='',
+ swap='face-35'),
+ 55:
+ dict(
+ name='face-32',
+ id=55,
+ color=[255, 255, 255],
+ type='',
+ swap='face-34'),
+ 56:
+ dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''),
+ 57:
+ dict(
+ name='face-34',
+ id=57,
+ color=[255, 255, 255],
+ type='',
+ swap='face-32'),
+ 58:
+ dict(
+ name='face-35',
+ id=58,
+ color=[255, 255, 255],
+ type='',
+ swap='face-31'),
+ 59:
+ dict(
+ name='face-36',
+ id=59,
+ color=[255, 255, 255],
+ type='',
+ swap='face-45'),
+ 60:
+ dict(
+ name='face-37',
+ id=60,
+ color=[255, 255, 255],
+ type='',
+ swap='face-44'),
+ 61:
+ dict(
+ name='face-38',
+ id=61,
+ color=[255, 255, 255],
+ type='',
+ swap='face-43'),
+ 62:
+ dict(
+ name='face-39',
+ id=62,
+ color=[255, 255, 255],
+ type='',
+ swap='face-42'),
+ 63:
+ dict(
+ name='face-40',
+ id=63,
+ color=[255, 255, 255],
+ type='',
+ swap='face-47'),
+ 64:
+ dict(
+ name='face-41',
+ id=64,
+ color=[255, 255, 255],
+ type='',
+ swap='face-46'),
+ 65:
+ dict(
+ name='face-42',
+ id=65,
+ color=[255, 255, 255],
+ type='',
+ swap='face-39'),
+ 66:
+ dict(
+ name='face-43',
+ id=66,
+ color=[255, 255, 255],
+ type='',
+ swap='face-38'),
+ 67:
+ dict(
+ name='face-44',
+ id=67,
+ color=[255, 255, 255],
+ type='',
+ swap='face-37'),
+ 68:
+ dict(
+ name='face-45',
+ id=68,
+ color=[255, 255, 255],
+ type='',
+ swap='face-36'),
+ 69:
+ dict(
+ name='face-46',
+ id=69,
+ color=[255, 255, 255],
+ type='',
+ swap='face-41'),
+ 70:
+ dict(
+ name='face-47',
+ id=70,
+ color=[255, 255, 255],
+ type='',
+ swap='face-40'),
+ 71:
+ dict(
+ name='face-48',
+ id=71,
+ color=[255, 255, 255],
+ type='',
+ swap='face-54'),
+ 72:
+ dict(
+ name='face-49',
+ id=72,
+ color=[255, 255, 255],
+ type='',
+ swap='face-53'),
+ 73:
+ dict(
+ name='face-50',
+ id=73,
+ color=[255, 255, 255],
+ type='',
+ swap='face-52'),
+ 74:
+ dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''),
+ 75:
+ dict(
+ name='face-52',
+ id=75,
+ color=[255, 255, 255],
+ type='',
+ swap='face-50'),
+ 76:
+ dict(
+ name='face-53',
+ id=76,
+ color=[255, 255, 255],
+ type='',
+ swap='face-49'),
+ 77:
+ dict(
+ name='face-54',
+ id=77,
+ color=[255, 255, 255],
+ type='',
+ swap='face-48'),
+ 78:
+ dict(
+ name='face-55',
+ id=78,
+ color=[255, 255, 255],
+ type='',
+ swap='face-59'),
+ 79:
+ dict(
+ name='face-56',
+ id=79,
+ color=[255, 255, 255],
+ type='',
+ swap='face-58'),
+ 80:
+ dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''),
+ 81:
+ dict(
+ name='face-58',
+ id=81,
+ color=[255, 255, 255],
+ type='',
+ swap='face-56'),
+ 82:
+ dict(
+ name='face-59',
+ id=82,
+ color=[255, 255, 255],
+ type='',
+ swap='face-55'),
+ 83:
+ dict(
+ name='face-60',
+ id=83,
+ color=[255, 255, 255],
+ type='',
+ swap='face-64'),
+ 84:
+ dict(
+ name='face-61',
+ id=84,
+ color=[255, 255, 255],
+ type='',
+ swap='face-63'),
+ 85:
+ dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''),
+ 86:
+ dict(
+ name='face-63',
+ id=86,
+ color=[255, 255, 255],
+ type='',
+ swap='face-61'),
+ 87:
+ dict(
+ name='face-64',
+ id=87,
+ color=[255, 255, 255],
+ type='',
+ swap='face-60'),
+ 88:
+ dict(
+ name='face-65',
+ id=88,
+ color=[255, 255, 255],
+ type='',
+ swap='face-67'),
+ 89:
+ dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''),
+ 90:
+ dict(
+ name='face-67',
+ id=90,
+ color=[255, 255, 255],
+ type='',
+ swap='face-65'),
+ 91:
+ dict(
+ name='left_hand_root',
+ id=91,
+ color=[255, 255, 255],
+ type='',
+ swap='right_hand_root'),
+ 92:
+ dict(
+ name='left_thumb1',
+ id=92,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb1'),
+ 93:
+ dict(
+ name='left_thumb2',
+ id=93,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb2'),
+ 94:
+ dict(
+ name='left_thumb3',
+ id=94,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb3'),
+ 95:
+ dict(
+ name='left_thumb4',
+ id=95,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb4'),
+ 96:
+ dict(
+ name='left_forefinger1',
+ id=96,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger1'),
+ 97:
+ dict(
+ name='left_forefinger2',
+ id=97,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger2'),
+ 98:
+ dict(
+ name='left_forefinger3',
+ id=98,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger3'),
+ 99:
+ dict(
+ name='left_forefinger4',
+ id=99,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger4'),
+ 100:
+ dict(
+ name='left_middle_finger1',
+ id=100,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 101:
+ dict(
+ name='left_middle_finger2',
+ id=101,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 102:
+ dict(
+ name='left_middle_finger3',
+ id=102,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 103:
+ dict(
+ name='left_middle_finger4',
+ id=103,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 104:
+ dict(
+ name='left_ring_finger1',
+ id=104,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger1'),
+ 105:
+ dict(
+ name='left_ring_finger2',
+ id=105,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger2'),
+ 106:
+ dict(
+ name='left_ring_finger3',
+ id=106,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger3'),
+ 107:
+ dict(
+ name='left_ring_finger4',
+ id=107,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger4'),
+ 108:
+ dict(
+ name='left_pinky_finger1',
+ id=108,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger1'),
+ 109:
+ dict(
+ name='left_pinky_finger2',
+ id=109,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger2'),
+ 110:
+ dict(
+ name='left_pinky_finger3',
+ id=110,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger3'),
+ 111:
+ dict(
+ name='left_pinky_finger4',
+ id=111,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger4'),
+ 112:
+ dict(
+ name='right_hand_root',
+ id=112,
+ color=[255, 255, 255],
+ type='',
+ swap='left_hand_root'),
+ 113:
+ dict(
+ name='right_thumb1',
+ id=113,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb1'),
+ 114:
+ dict(
+ name='right_thumb2',
+ id=114,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb2'),
+ 115:
+ dict(
+ name='right_thumb3',
+ id=115,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb3'),
+ 116:
+ dict(
+ name='right_thumb4',
+ id=116,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb4'),
+ 117:
+ dict(
+ name='right_forefinger1',
+ id=117,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger1'),
+ 118:
+ dict(
+ name='right_forefinger2',
+ id=118,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger2'),
+ 119:
+ dict(
+ name='right_forefinger3',
+ id=119,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger3'),
+ 120:
+ dict(
+ name='right_forefinger4',
+ id=120,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger4'),
+ 121:
+ dict(
+ name='right_middle_finger1',
+ id=121,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 122:
+ dict(
+ name='right_middle_finger2',
+ id=122,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 123:
+ dict(
+ name='right_middle_finger3',
+ id=123,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 124:
+ dict(
+ name='right_middle_finger4',
+ id=124,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 125:
+ dict(
+ name='right_ring_finger1',
+ id=125,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger1'),
+ 126:
+ dict(
+ name='right_ring_finger2',
+ id=126,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger2'),
+ 127:
+ dict(
+ name='right_ring_finger3',
+ id=127,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger3'),
+ 128:
+ dict(
+ name='right_ring_finger4',
+ id=128,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger4'),
+ 129:
+ dict(
+ name='right_pinky_finger1',
+ id=129,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger1'),
+ 130:
+ dict(
+ name='right_pinky_finger2',
+ id=130,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger2'),
+ 131:
+ dict(
+ name='right_pinky_finger3',
+ id=131,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger3'),
+ 132:
+ dict(
+ name='right_pinky_finger4',
+ id=132,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger4')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]),
+ 20:
+ dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]),
+ 21:
+ dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=23,
+ color=[255, 128, 0]),
+ 24:
+ dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]),
+ 25:
+ dict(
+ link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128,
+ 0]),
+ 26:
+ dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]),
+ 27:
+ dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]),
+ 28:
+ dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]),
+ 29:
+ dict(
+ link=('left_hand_root', 'left_forefinger1'),
+ id=29,
+ color=[255, 153, 255]),
+ 30:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=30,
+ color=[255, 153, 255]),
+ 31:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=31,
+ color=[255, 153, 255]),
+ 32:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=32,
+ color=[255, 153, 255]),
+ 33:
+ dict(
+ link=('left_hand_root', 'left_middle_finger1'),
+ id=33,
+ color=[102, 178, 255]),
+ 34:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=34,
+ color=[102, 178, 255]),
+ 35:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=35,
+ color=[102, 178, 255]),
+ 36:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=36,
+ color=[102, 178, 255]),
+ 37:
+ dict(
+ link=('left_hand_root', 'left_ring_finger1'),
+ id=37,
+ color=[255, 51, 51]),
+ 38:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=38,
+ color=[255, 51, 51]),
+ 39:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=39,
+ color=[255, 51, 51]),
+ 40:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=40,
+ color=[255, 51, 51]),
+ 41:
+ dict(
+ link=('left_hand_root', 'left_pinky_finger1'),
+ id=41,
+ color=[0, 255, 0]),
+ 42:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=42,
+ color=[0, 255, 0]),
+ 43:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=43,
+ color=[0, 255, 0]),
+ 44:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=44,
+ color=[0, 255, 0]),
+ 45:
+ dict(
+ link=('right_hand_root', 'right_thumb1'),
+ id=45,
+ color=[255, 128, 0]),
+ 46:
+ dict(
+ link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]),
+ 47:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]),
+ 48:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]),
+ 49:
+ dict(
+ link=('right_hand_root', 'right_forefinger1'),
+ id=49,
+ color=[255, 153, 255]),
+ 50:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=50,
+ color=[255, 153, 255]),
+ 51:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=51,
+ color=[255, 153, 255]),
+ 52:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=52,
+ color=[255, 153, 255]),
+ 53:
+ dict(
+ link=('right_hand_root', 'right_middle_finger1'),
+ id=53,
+ color=[102, 178, 255]),
+ 54:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=54,
+ color=[102, 178, 255]),
+ 55:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=55,
+ color=[102, 178, 255]),
+ 56:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=56,
+ color=[102, 178, 255]),
+ 57:
+ dict(
+ link=('right_hand_root', 'right_ring_finger1'),
+ id=57,
+ color=[255, 51, 51]),
+ 58:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=58,
+ color=[255, 51, 51]),
+ 59:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=59,
+ color=[255, 51, 51]),
+ 60:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=60,
+ color=[255, 51, 51]),
+ 61:
+ dict(
+ link=('right_hand_root', 'right_pinky_finger1'),
+ id=61,
+ color=[0, 255, 0]),
+ 62:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=62,
+ color=[0, 255, 0]),
+ 63:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=63,
+ color=[0, 255, 0]),
+ 64:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=64,
+ color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 133,
+ # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/'
+ # 'evaluation/myeval_wholebody.py#L175'
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066,
+ 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031,
+ 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045,
+ 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015,
+ 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017,
+ 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010,
+ 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009,
+ 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01,
+ 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035,
+ 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019,
+ 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024,
+ 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02,
+ 0.019, 0.022, 0.031
+ ])
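COCO-WholeBody packs 133 keypoints into a single array: 17 body, 6 feet, 68 face, and 21 per hand, in the index order laid out above. A small sketch of slicing that layout, with the slice bounds read off the `keypoint_info` ids:

```python
# Index layout of the 133 whole-body keypoints defined above.
BODY, FEET = slice(0, 17), slice(17, 23)
FACE, LEFT_HAND, RIGHT_HAND = slice(23, 91), slice(91, 112), slice(112, 133)

def split_wholebody(kpts):
    """Split a (133, ...) keypoint array into its named parts."""
    return {
        'body': kpts[BODY], 'feet': kpts[FEET], 'face': kpts[FACE],
        'left_hand': kpts[LEFT_HAND], 'right_hand': kpts[RIGHT_HAND],
    }
```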
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py
new file mode 100644
index 0000000..e208671
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_face.py
@@ -0,0 +1,154 @@
+dataset_info = dict(
+ dataset_name='coco_wholebody_face',
+ paper_info=dict(
+ author='Jin, Sheng and Xu, Lumin and Xu, Jin and '
+ 'Wang, Can and Liu, Wentao and '
+ 'Qian, Chen and Ouyang, Wanli and Luo, Ping',
+ title='Whole-Body Human Pose Estimation in the Wild',
+ container='Proceedings of the European '
+ 'Conference on Computer Vision (ECCV)',
+ year='2020',
+ homepage='https://github.com/jin-s13/COCO-WholeBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='face-0', id=0, color=[255, 0, 0], type='', swap='face-16'),
+ 1:
+ dict(name='face-1', id=1, color=[255, 0, 0], type='', swap='face-15'),
+ 2:
+ dict(name='face-2', id=2, color=[255, 0, 0], type='', swap='face-14'),
+ 3:
+ dict(name='face-3', id=3, color=[255, 0, 0], type='', swap='face-13'),
+ 4:
+ dict(name='face-4', id=4, color=[255, 0, 0], type='', swap='face-12'),
+ 5:
+ dict(name='face-5', id=5, color=[255, 0, 0], type='', swap='face-11'),
+ 6:
+ dict(name='face-6', id=6, color=[255, 0, 0], type='', swap='face-10'),
+ 7:
+ dict(name='face-7', id=7, color=[255, 0, 0], type='', swap='face-9'),
+ 8: dict(name='face-8', id=8, color=[255, 0, 0], type='', swap=''),
+ 9:
+ dict(name='face-9', id=9, color=[255, 0, 0], type='', swap='face-7'),
+ 10:
+ dict(name='face-10', id=10, color=[255, 0, 0], type='', swap='face-6'),
+ 11:
+ dict(name='face-11', id=11, color=[255, 0, 0], type='', swap='face-5'),
+ 12:
+ dict(name='face-12', id=12, color=[255, 0, 0], type='', swap='face-4'),
+ 13:
+ dict(name='face-13', id=13, color=[255, 0, 0], type='', swap='face-3'),
+ 14:
+ dict(name='face-14', id=14, color=[255, 0, 0], type='', swap='face-2'),
+ 15:
+ dict(name='face-15', id=15, color=[255, 0, 0], type='', swap='face-1'),
+ 16:
+ dict(name='face-16', id=16, color=[255, 0, 0], type='', swap='face-0'),
+ 17: dict(
+ name='face-17', id=17, color=[255, 0, 0], type='', swap='face-26'),
+ 18: dict(
+ name='face-18', id=18, color=[255, 0, 0], type='', swap='face-25'),
+ 19: dict(
+ name='face-19', id=19, color=[255, 0, 0], type='', swap='face-24'),
+ 20: dict(
+ name='face-20', id=20, color=[255, 0, 0], type='', swap='face-23'),
+ 21: dict(
+ name='face-21', id=21, color=[255, 0, 0], type='', swap='face-22'),
+ 22: dict(
+ name='face-22', id=22, color=[255, 0, 0], type='', swap='face-21'),
+ 23: dict(
+ name='face-23', id=23, color=[255, 0, 0], type='', swap='face-20'),
+ 24: dict(
+ name='face-24', id=24, color=[255, 0, 0], type='', swap='face-19'),
+ 25: dict(
+ name='face-25', id=25, color=[255, 0, 0], type='', swap='face-18'),
+ 26: dict(
+ name='face-26', id=26, color=[255, 0, 0], type='', swap='face-17'),
+ 27: dict(name='face-27', id=27, color=[255, 0, 0], type='', swap=''),
+ 28: dict(name='face-28', id=28, color=[255, 0, 0], type='', swap=''),
+ 29: dict(name='face-29', id=29, color=[255, 0, 0], type='', swap=''),
+ 30: dict(name='face-30', id=30, color=[255, 0, 0], type='', swap=''),
+ 31: dict(
+ name='face-31', id=31, color=[255, 0, 0], type='', swap='face-35'),
+ 32: dict(
+ name='face-32', id=32, color=[255, 0, 0], type='', swap='face-34'),
+ 33: dict(name='face-33', id=33, color=[255, 0, 0], type='', swap=''),
+ 34: dict(
+ name='face-34', id=34, color=[255, 0, 0], type='', swap='face-32'),
+ 35: dict(
+ name='face-35', id=35, color=[255, 0, 0], type='', swap='face-31'),
+ 36: dict(
+ name='face-36', id=36, color=[255, 0, 0], type='', swap='face-45'),
+ 37: dict(
+ name='face-37', id=37, color=[255, 0, 0], type='', swap='face-44'),
+ 38: dict(
+ name='face-38', id=38, color=[255, 0, 0], type='', swap='face-43'),
+ 39: dict(
+ name='face-39', id=39, color=[255, 0, 0], type='', swap='face-42'),
+ 40: dict(
+ name='face-40', id=40, color=[255, 0, 0], type='', swap='face-47'),
+ 41: dict(
+ name='face-41', id=41, color=[255, 0, 0], type='', swap='face-46'),
+ 42: dict(
+ name='face-42', id=42, color=[255, 0, 0], type='', swap='face-39'),
+ 43: dict(
+ name='face-43', id=43, color=[255, 0, 0], type='', swap='face-38'),
+ 44: dict(
+ name='face-44', id=44, color=[255, 0, 0], type='', swap='face-37'),
+ 45: dict(
+ name='face-45', id=45, color=[255, 0, 0], type='', swap='face-36'),
+ 46: dict(
+ name='face-46', id=46, color=[255, 0, 0], type='', swap='face-41'),
+ 47: dict(
+ name='face-47', id=47, color=[255, 0, 0], type='', swap='face-40'),
+ 48: dict(
+ name='face-48', id=48, color=[255, 0, 0], type='', swap='face-54'),
+ 49: dict(
+ name='face-49', id=49, color=[255, 0, 0], type='', swap='face-53'),
+ 50: dict(
+ name='face-50', id=50, color=[255, 0, 0], type='', swap='face-52'),
+        51: dict(name='face-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52: dict(
+ name='face-52', id=52, color=[255, 0, 0], type='', swap='face-50'),
+ 53: dict(
+ name='face-53', id=53, color=[255, 0, 0], type='', swap='face-49'),
+ 54: dict(
+ name='face-54', id=54, color=[255, 0, 0], type='', swap='face-48'),
+ 55: dict(
+ name='face-55', id=55, color=[255, 0, 0], type='', swap='face-59'),
+ 56: dict(
+ name='face-56', id=56, color=[255, 0, 0], type='', swap='face-58'),
+ 57: dict(name='face-57', id=57, color=[255, 0, 0], type='', swap=''),
+ 58: dict(
+ name='face-58', id=58, color=[255, 0, 0], type='', swap='face-56'),
+ 59: dict(
+ name='face-59', id=59, color=[255, 0, 0], type='', swap='face-55'),
+ 60: dict(
+ name='face-60', id=60, color=[255, 0, 0], type='', swap='face-64'),
+ 61: dict(
+ name='face-61', id=61, color=[255, 0, 0], type='', swap='face-63'),
+ 62: dict(name='face-62', id=62, color=[255, 0, 0], type='', swap=''),
+ 63: dict(
+ name='face-63', id=63, color=[255, 0, 0], type='', swap='face-61'),
+ 64: dict(
+ name='face-64', id=64, color=[255, 0, 0], type='', swap='face-60'),
+ 65: dict(
+ name='face-65', id=65, color=[255, 0, 0], type='', swap='face-67'),
+ 66: dict(name='face-66', id=66, color=[255, 0, 0], type='', swap=''),
+ 67: dict(
+ name='face-67', id=67, color=[255, 0, 0], type='', swap='face-65')
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 68,
+
+ # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/'
+ # 'evaluation/myeval_wholebody.py#L177'
+ sigmas=[
+ 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, 0.025, 0.020, 0.023,
+ 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011,
+ 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007,
+ 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011,
+ 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008, 0.008,
+ 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009, 0.009, 0.007,
+ 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, 0.008
+ ])
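The 68 `face-*` points mirror the widely used 68-landmark (iBUG/300-W style) ordering, which is also what the `swap` pairs above encode. The region grouping below is an assumption drawn from that convention, not something the config states, but it is convenient for masking or per-region evaluation:

```python
# Hypothetical region groups for the 68 face landmarks (0-indexed),
# assuming the standard iBUG ordering implied by the swap pairs above.
FACE68_REGIONS = {
    'jaw': range(0, 17),
    'right_eyebrow': range(17, 22), 'left_eyebrow': range(22, 27),
    'nose': range(27, 36),
    'right_eye': range(36, 42), 'left_eye': range(42, 48),
    'outer_lips': range(48, 60), 'inner_lips': range(60, 68),
}
```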
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py
new file mode 100644
index 0000000..585ed78
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_hand.py
@@ -0,0 +1,147 @@
+dataset_info = dict(
+ dataset_name='coco_wholebody_hand',
+ paper_info=dict(
+ author='Jin, Sheng and Xu, Lumin and Xu, Jin and '
+ 'Wang, Can and Liu, Wentao and '
+ 'Qian, Chen and Ouyang, Wanli and Luo, Ping',
+ title='Whole-Body Human Pose Estimation in the Wild',
+ container='Proceedings of the European '
+ 'Conference on Computer Vision (ECCV)',
+ year='2020',
+ homepage='https://github.com/jin-s13/COCO-WholeBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger1',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger3',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger4',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[
+ 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018,
+ 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022,
+ 0.031
+ ])
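The hand config uses the familiar 21-point layout (wrist plus four joints per finger), and `skeleton_info` stores each limb as a pair of keypoint names with a color. A hedged rendering sketch with OpenCV (`draw_hand` and its arguments are illustrative; the colors are assumed to be RGB and are reversed for OpenCV's BGR order):

```python
import cv2

def draw_hand(img, kpts, dataset_info):
    """Draw the 21-point hand skeleton defined above onto `img`.

    `kpts` is a hypothetical (21, 2) array of pixel coordinates and
    `dataset_info` is the dict from this config file.
    """
    name_to_idx = {v['name']: k for k, v in dataset_info['keypoint_info'].items()}
    for link in dataset_info['skeleton_info'].values():
        a, b = (name_to_idx[n] for n in link['link'])
        bgr = tuple(link['color'][::-1])        # assume RGB config colors
        cv2.line(img, tuple(map(int, kpts[a])), tuple(map(int, kpts[b])), bgr, 2)
    for idx, info in dataset_info['keypoint_info'].items():
        cv2.circle(img, tuple(map(int, kpts[idx])), 3, tuple(info['color'][::-1]), -1)
    return img
```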
diff --git a/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py
new file mode 100644
index 0000000..315c0fb
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/coco_wholebody_openpose.py
@@ -0,0 +1,1128 @@
+dataset_info = dict(
+ dataset_name='coco_wholebody_openpose',
+ paper_info=dict(
+ author='Jin, Sheng and Xu, Lumin and Xu, Jin and '
+ 'Wang, Can and Liu, Wentao and '
+ 'Qian, Chen and Ouyang, Wanli and Luo, Ping',
+ title='Whole-Body Human Pose Estimation in the Wild',
+ container='Proceedings of the European '
+ 'Conference on Computer Vision (ECCV)',
+ year='2020',
+ homepage='https://github.com/jin-s13/COCO-WholeBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[255, 0, 0], type='upper', swap=''),
+ 1:
+ dict(name='neck', id=1, color=[255, 85, 0], type='upper', swap=''),
+ 2:
+ dict(
+ name='right_shoulder',
+ id=2,
+ color=[255, 170, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[255, 255, 0],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='right_wrist',
+ id=4,
+ color=[170, 255, 0],
+ type='upper',
+ swap='left_wrist'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[85, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='left_elbow',
+ id=6,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 7:
+ dict(
+ name='left_wrist',
+ id=7,
+ color=[0, 255, 85],
+ type='upper',
+ swap='right_wrist'),
+ 8:
+ dict(
+ name='right_hip',
+ id=8,
+ color=[0, 255, 170],
+ type='lower',
+ swap='left_hip'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[0, 255, 255],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='right_ankle',
+ id=10,
+ color=[0, 170, 255],
+ type='lower',
+ swap='left_ankle'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 85, 255],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='left_knee',
+ id=12,
+ color=[0, 0, 255],
+ type='lower',
+ swap='right_knee'),
+ 13:
+ dict(
+ name='left_ankle',
+ id=13,
+ color=[85, 0, 255],
+ type='lower',
+ swap='right_ankle'),
+ 14:
+ dict(
+ name='right_eye',
+ id=14,
+ color=[170, 0, 255],
+ type='upper',
+ swap='left_eye'),
+ 15:
+ dict(
+ name='left_eye',
+ id=15,
+ color=[255, 0, 255],
+ type='upper',
+ swap='right_eye'),
+ 16:
+ dict(
+ name='right_ear',
+ id=16,
+ color=[255, 0, 170],
+ type='upper',
+ swap='left_ear'),
+ 17:
+ dict(
+ name='left_ear',
+ id=17,
+ color=[255, 0, 85],
+ type='upper',
+ swap='right_ear'),
+ 18:
+ dict(
+ name='left_big_toe',
+ id=17,
+ color=[0, 0, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 19:
+ dict(
+ name='left_small_toe',
+ id=18,
+ color=[0, 0, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 20:
+ dict(
+ name='left_heel',
+ id=19,
+ color=[0, 0, 0],
+ type='lower',
+ swap='right_heel'),
+ 21:
+ dict(
+ name='right_big_toe',
+ id=20,
+ color=[0, 0, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 22:
+ dict(
+ name='right_small_toe',
+ id=21,
+ color=[0, 0, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 23:
+ dict(
+ name='right_heel',
+ id=22,
+ color=[0, 0, 0],
+ type='lower',
+ swap='left_heel'),
+ 24:
+ dict(
+ name='face-0',
+ id=23,
+ color=[255, 255, 255],
+ type='',
+ swap='face-16'),
+ 25:
+ dict(
+ name='face-1',
+ id=24,
+ color=[255, 255, 255],
+ type='',
+ swap='face-15'),
+ 26:
+ dict(
+ name='face-2',
+ id=25,
+ color=[255, 255, 255],
+ type='',
+ swap='face-14'),
+ 27:
+ dict(
+ name='face-3',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='face-13'),
+ 28:
+ dict(
+ name='face-4',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='face-12'),
+ 29:
+ dict(
+ name='face-5',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='face-11'),
+ 30:
+ dict(
+ name='face-6',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='face-10'),
+ 31:
+ dict(
+ name='face-7',
+ id=30,
+ color=[255, 255, 255],
+ type='',
+ swap='face-9'),
+ 32:
+ dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''),
+ 33:
+ dict(
+ name='face-9',
+ id=32,
+ color=[255, 255, 255],
+ type='',
+ swap='face-7'),
+ 34:
+ dict(
+ name='face-10',
+ id=33,
+ color=[255, 255, 255],
+ type='',
+ swap='face-6'),
+ 35:
+ dict(
+ name='face-11',
+ id=34,
+ color=[255, 255, 255],
+ type='',
+ swap='face-5'),
+ 36:
+ dict(
+ name='face-12',
+ id=35,
+ color=[255, 255, 255],
+ type='',
+ swap='face-4'),
+ 37:
+ dict(
+ name='face-13',
+ id=36,
+ color=[255, 255, 255],
+ type='',
+ swap='face-3'),
+ 38:
+ dict(
+ name='face-14',
+ id=37,
+ color=[255, 255, 255],
+ type='',
+ swap='face-2'),
+ 39:
+ dict(
+ name='face-15',
+ id=38,
+ color=[255, 255, 255],
+ type='',
+ swap='face-1'),
+ 40:
+ dict(
+ name='face-16',
+ id=39,
+ color=[255, 255, 255],
+ type='',
+ swap='face-0'),
+ 41:
+ dict(
+ name='face-17',
+ id=40,
+ color=[255, 255, 255],
+ type='',
+ swap='face-26'),
+ 42:
+ dict(
+ name='face-18',
+ id=41,
+ color=[255, 255, 255],
+ type='',
+ swap='face-25'),
+ 43:
+ dict(
+ name='face-19',
+ id=42,
+ color=[255, 255, 255],
+ type='',
+ swap='face-24'),
+ 44:
+ dict(
+ name='face-20',
+ id=43,
+ color=[255, 255, 255],
+ type='',
+ swap='face-23'),
+ 45:
+ dict(
+ name='face-21',
+ id=44,
+ color=[255, 255, 255],
+ type='',
+ swap='face-22'),
+ 46:
+ dict(
+ name='face-22',
+ id=45,
+ color=[255, 255, 255],
+ type='',
+ swap='face-21'),
+ 47:
+ dict(
+ name='face-23',
+ id=46,
+ color=[255, 255, 255],
+ type='',
+ swap='face-20'),
+ 48:
+ dict(
+ name='face-24',
+ id=47,
+ color=[255, 255, 255],
+ type='',
+ swap='face-19'),
+ 49:
+ dict(
+ name='face-25',
+ id=48,
+ color=[255, 255, 255],
+ type='',
+ swap='face-18'),
+ 50:
+ dict(
+ name='face-26',
+ id=49,
+ color=[255, 255, 255],
+ type='',
+ swap='face-17'),
+ 51:
+ dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''),
+ 52:
+ dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''),
+ 53:
+ dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''),
+ 54:
+ dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''),
+ 55:
+ dict(
+ name='face-31',
+ id=54,
+ color=[255, 255, 255],
+ type='',
+ swap='face-35'),
+ 56:
+ dict(
+ name='face-32',
+ id=55,
+ color=[255, 255, 255],
+ type='',
+ swap='face-34'),
+ 57:
+ dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''),
+ 58:
+ dict(
+ name='face-34',
+ id=57,
+ color=[255, 255, 255],
+ type='',
+ swap='face-32'),
+ 59:
+ dict(
+ name='face-35',
+ id=58,
+ color=[255, 255, 255],
+ type='',
+ swap='face-31'),
+ 60:
+ dict(
+ name='face-36',
+ id=59,
+ color=[255, 255, 255],
+ type='',
+ swap='face-45'),
+ 61:
+ dict(
+ name='face-37',
+ id=60,
+ color=[255, 255, 255],
+ type='',
+ swap='face-44'),
+ 62:
+ dict(
+ name='face-38',
+ id=61,
+ color=[255, 255, 255],
+ type='',
+ swap='face-43'),
+ 63:
+ dict(
+ name='face-39',
+ id=62,
+ color=[255, 255, 255],
+ type='',
+ swap='face-42'),
+ 64:
+ dict(
+ name='face-40',
+ id=63,
+ color=[255, 255, 255],
+ type='',
+ swap='face-47'),
+ 65:
+ dict(
+ name='face-41',
+ id=64,
+ color=[255, 255, 255],
+ type='',
+ swap='face-46'),
+ 66:
+ dict(
+ name='face-42',
+ id=65,
+ color=[255, 255, 255],
+ type='',
+ swap='face-39'),
+ 67:
+ dict(
+ name='face-43',
+ id=66,
+ color=[255, 255, 255],
+ type='',
+ swap='face-38'),
+ 68:
+ dict(
+ name='face-44',
+ id=67,
+ color=[255, 255, 255],
+ type='',
+ swap='face-37'),
+ 69:
+ dict(
+ name='face-45',
+ id=68,
+ color=[255, 255, 255],
+ type='',
+ swap='face-36'),
+ 70:
+ dict(
+ name='face-46',
+ id=69,
+ color=[255, 255, 255],
+ type='',
+ swap='face-41'),
+ 71:
+ dict(
+ name='face-47',
+ id=70,
+ color=[255, 255, 255],
+ type='',
+ swap='face-40'),
+ 72:
+ dict(
+ name='face-48',
+ id=71,
+ color=[255, 255, 255],
+ type='',
+ swap='face-54'),
+ 73:
+ dict(
+ name='face-49',
+ id=72,
+ color=[255, 255, 255],
+ type='',
+ swap='face-53'),
+ 74:
+ dict(
+ name='face-50',
+ id=73,
+ color=[255, 255, 255],
+ type='',
+ swap='face-52'),
+ 75:
+ dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''),
+ 76:
+ dict(
+ name='face-52',
+ id=75,
+ color=[255, 255, 255],
+ type='',
+ swap='face-50'),
+ 77:
+ dict(
+ name='face-53',
+ id=76,
+ color=[255, 255, 255],
+ type='',
+ swap='face-49'),
+ 78:
+ dict(
+ name='face-54',
+ id=77,
+ color=[255, 255, 255],
+ type='',
+ swap='face-48'),
+ 79:
+ dict(
+ name='face-55',
+ id=78,
+ color=[255, 255, 255],
+ type='',
+ swap='face-59'),
+ 80:
+ dict(
+ name='face-56',
+ id=79,
+ color=[255, 255, 255],
+ type='',
+ swap='face-58'),
+ 81:
+ dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''),
+ 82:
+ dict(
+ name='face-58',
+ id=81,
+ color=[255, 255, 255],
+ type='',
+ swap='face-56'),
+ 83:
+ dict(
+ name='face-59',
+ id=82,
+ color=[255, 255, 255],
+ type='',
+ swap='face-55'),
+ 84:
+ dict(
+ name='face-60',
+ id=83,
+ color=[255, 255, 255],
+ type='',
+ swap='face-64'),
+ 85:
+ dict(
+ name='face-61',
+ id=84,
+ color=[255, 255, 255],
+ type='',
+ swap='face-63'),
+ 86:
+ dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''),
+ 87:
+ dict(
+ name='face-63',
+ id=86,
+ color=[255, 255, 255],
+ type='',
+ swap='face-61'),
+ 88:
+ dict(
+ name='face-64',
+ id=87,
+ color=[255, 255, 255],
+ type='',
+ swap='face-60'),
+ 89:
+ dict(
+ name='face-65',
+ id=88,
+ color=[255, 255, 255],
+ type='',
+ swap='face-67'),
+ 90:
+ dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''),
+ 91:
+ dict(
+ name='face-67',
+ id=90,
+ color=[255, 255, 255],
+ type='',
+ swap='face-65'),
+ 92:
+ dict(
+ name='left_hand_root',
+ id=92,
+ color=[0, 0, 255],
+ type='',
+ swap='right_hand_root'),
+ 93:
+ dict(
+ name='left_thumb1',
+ id=93,
+ color=[0, 0, 255],
+ type='',
+ swap='right_thumb1'),
+ 94:
+ dict(
+ name='left_thumb2',
+ id=94,
+ color=[0, 0, 255],
+ type='',
+ swap='right_thumb2'),
+ 95:
+ dict(
+ name='left_thumb3',
+ id=95,
+ color=[0, 0, 255],
+ type='',
+ swap='right_thumb3'),
+ 96:
+ dict(
+ name='left_thumb4',
+ id=96,
+ color=[0, 0, 255],
+ type='',
+ swap='right_thumb4'),
+ 97:
+ dict(
+ name='left_forefinger1',
+ id=97,
+ color=[0, 0, 255],
+ type='',
+ swap='right_forefinger1'),
+ 98:
+ dict(
+ name='left_forefinger2',
+ id=98,
+ color=[0, 0, 255],
+ type='',
+ swap='right_forefinger2'),
+ 99:
+ dict(
+ name='left_forefinger3',
+ id=99,
+ color=[0, 0, 255],
+ type='',
+ swap='right_forefinger3'),
+ 100:
+ dict(
+ name='left_forefinger4',
+ id=100,
+ color=[0, 0, 255],
+ type='',
+ swap='right_forefinger4'),
+ 101:
+ dict(
+ name='left_middle_finger1',
+ id=101,
+ color=[0, 0, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 102:
+ dict(
+ name='left_middle_finger2',
+ id=102,
+ color=[0, 0, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 103:
+ dict(
+ name='left_middle_finger3',
+ id=103,
+ color=[0, 0, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 104:
+ dict(
+ name='left_middle_finger4',
+ id=104,
+ color=[0, 0, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 105:
+ dict(
+ name='left_ring_finger1',
+ id=105,
+ color=[0, 0, 255],
+ type='',
+ swap='right_ring_finger1'),
+ 106:
+ dict(
+ name='left_ring_finger2',
+ id=106,
+ color=[0, 0, 255],
+ type='',
+ swap='right_ring_finger2'),
+ 107:
+ dict(
+ name='left_ring_finger3',
+ id=107,
+ color=[0, 0, 255],
+ type='',
+ swap='right_ring_finger3'),
+ 108:
+ dict(
+ name='left_ring_finger4',
+ id=108,
+ color=[0, 0, 255],
+ type='',
+ swap='right_ring_finger4'),
+ 109:
+ dict(
+ name='left_pinky_finger1',
+ id=109,
+ color=[0, 0, 255],
+ type='',
+ swap='right_pinky_finger1'),
+ 110:
+ dict(
+ name='left_pinky_finger2',
+ id=110,
+ color=[0, 0, 255],
+ type='',
+ swap='right_pinky_finger2'),
+ 111:
+ dict(
+ name='left_pinky_finger3',
+ id=111,
+ color=[0, 0, 255],
+ type='',
+ swap='right_pinky_finger3'),
+ 112:
+ dict(
+ name='left_pinky_finger4',
+ id=112,
+ color=[0, 0, 255],
+ type='',
+ swap='right_pinky_finger4'),
+ 113:
+ dict(
+ name='right_hand_root',
+ id=113,
+ color=[0, 0, 255],
+ type='',
+ swap='left_hand_root'),
+ 114:
+ dict(
+ name='right_thumb1',
+ id=114,
+ color=[0, 0, 255],
+ type='',
+ swap='left_thumb1'),
+ 115:
+ dict(
+ name='right_thumb2',
+ id=115,
+ color=[0, 0, 255],
+ type='',
+ swap='left_thumb2'),
+ 116:
+ dict(
+ name='right_thumb3',
+ id=116,
+ color=[0, 0, 255],
+ type='',
+ swap='left_thumb3'),
+ 117:
+ dict(
+ name='right_thumb4',
+ id=117,
+ color=[0, 0, 255],
+ type='',
+ swap='left_thumb4'),
+ 118:
+ dict(
+ name='right_forefinger1',
+ id=118,
+ color=[0, 0, 255],
+ type='',
+ swap='left_forefinger1'),
+ 119:
+ dict(
+ name='right_forefinger2',
+ id=119,
+ color=[0, 0, 255],
+ type='',
+ swap='left_forefinger2'),
+ 120:
+ dict(
+ name='right_forefinger3',
+ id=120,
+ color=[0, 0, 255],
+ type='',
+ swap='left_forefinger3'),
+ 121:
+ dict(
+ name='right_forefinger4',
+ id=121,
+ color=[0, 0, 255],
+ type='',
+ swap='left_forefinger4'),
+ 122:
+ dict(
+ name='right_middle_finger1',
+ id=122,
+ color=[0, 0, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 123:
+ dict(
+ name='right_middle_finger2',
+ id=123,
+ color=[0, 0, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 124:
+ dict(
+ name='right_middle_finger3',
+ id=124,
+ color=[0, 0, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 125:
+ dict(
+ name='right_middle_finger4',
+ id=125,
+ color=[0, 0, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 126:
+ dict(
+ name='right_ring_finger1',
+ id=126,
+ color=[0, 0, 255],
+ type='',
+ swap='left_ring_finger1'),
+ 127:
+ dict(
+ name='right_ring_finger2',
+ id=127,
+ color=[0, 0, 255],
+ type='',
+ swap='left_ring_finger2'),
+ 128:
+ dict(
+ name='right_ring_finger3',
+ id=128,
+ color=[0, 0, 255],
+ type='',
+ swap='left_ring_finger3'),
+ 129:
+ dict(
+ name='right_ring_finger4',
+ id=129,
+ color=[0, 0, 255],
+ type='',
+ swap='left_ring_finger4'),
+ 130:
+ dict(
+ name='right_pinky_finger1',
+ id=130,
+ color=[0, 0, 255],
+ type='',
+ swap='left_pinky_finger1'),
+ 131:
+ dict(
+ name='right_pinky_finger2',
+ id=131,
+ color=[0, 0, 255],
+ type='',
+ swap='left_pinky_finger2'),
+ 132:
+ dict(
+ name='right_pinky_finger3',
+ id=132,
+ color=[0, 0, 255],
+ type='',
+ swap='left_pinky_finger3'),
+ 133:
+ dict(
+ name='right_pinky_finger4',
+ id=133,
+ color=[0, 0, 255],
+ type='',
+ swap='left_pinky_finger4')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('neck', 'right_shoulder'), id=0, color=[255, 0, 0]),
+ 1:
+ dict(link=('neck', 'left_shoulder'), id=1, color=[255, 85, 0]),
+ 2:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=2, color=[255, 170, 0]),
+ 3:
+ dict(link=('right_elbow', 'right_wrist'), id=3, color=[255, 255, 0]),
+ 4:
+ dict(link=('left_shoulder', 'left_elbow'), id=4, color=[170, 255, 0]),
+ 5:
+ dict(link=('left_elbow', 'left_wrist'), id=5, color=[85, 255, 0]),
+ 6:
+ dict(link=('neck', 'right_hip'), id=6, color=[0, 255, 0]),
+ 7:
+ dict(link=('right_hip', 'right_knee'), id=7, color=[0, 255, 85]),
+ 8:
+ dict(link=('right_knee', 'right_ankle'), id=8, color=[0, 255, 170]),
+ 9:
+        dict(link=('neck', 'left_hip'), id=9, color=[0, 255, 255]),
+ 10:
+ dict(link=('left_hip', 'left_knee'), id=10, color=[0, 170, 255]),
+ 11:
+ dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 85, 255]),
+ 12:
+ dict(link=('neck', 'nose'), id=12, color=[0, 0, 255]),
+ 13:
+ dict(link=('nose', 'right_eye'), id=13, color=[255, 0, 170]),
+ 14:
+ dict(link=('right_eye', 'right_ear'), id=14, color=[170, 0, 255]),
+ 15:
+ dict(link=('nose', 'left_eye'), id=15, color=[255, 0, 255]),
+ 16:
+ dict(link=('left_eye', 'left_ear'), id=16, color=[255, 0, 170]),
+ 17:
+ dict(link=('left_hand_root', 'left_thumb1'), id=17, color=[255, 0, 0]),
+ 18:
+ dict(link=('left_thumb1', 'left_thumb2'), id=18, color=[255, 76, 0]),
+ 19:
+ dict(link=('left_thumb2', 'left_thumb3'), id=19, color=[255, 153, 0]),
+ 20:
+ dict(link=('left_thumb3', 'left_thumb4'), id=20, color=[255, 230, 0]),
+ 21:
+ dict(
+ link=('left_hand_root', 'left_forefinger1'),
+ id=21,
+ color=[204, 255, 0]),
+ 22:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=22,
+ color=[128, 255, 0]),
+ 23:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=23,
+ color=[51, 255, 0]),
+ 24:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=24,
+ color=[0, 255, 26]),
+ 25:
+ dict(
+ link=('left_hand_root', 'left_middle_finger1'),
+ id=25,
+ color=[0, 255, 102]),
+ 26:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=26,
+ color=[0, 255, 178]),
+ 27:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=27,
+ color=[0, 255, 255]),
+ 28:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=28,
+ color=[0, 178, 255]),
+ 29:
+ dict(
+ link=('left_hand_root', 'left_ring_finger1'),
+ id=29,
+ color=[0, 102, 255]),
+ 30:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=30,
+ color=[0, 26, 255]),
+ 31:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=31,
+ color=[51, 0, 255]),
+ 32:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=32,
+ color=[128, 0, 255]),
+ 33:
+ dict(
+ link=('left_hand_root', 'left_pinky_finger1'),
+ id=33,
+ color=[204, 0, 255]),
+ 34:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=34,
+ color=[255, 0, 230]),
+ 35:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=35,
+ color=[255, 0, 153]),
+ 36:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=36,
+ color=[255, 0, 76]),
+ 37:
+ dict(
+ link=('right_hand_root', 'right_thumb1'), id=37, color=[255, 0,
+ 0]),
+ 38:
+ dict(link=('right_thumb1', 'right_thumb2'), id=38, color=[255, 76, 0]),
+ 39:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=39, color=[255, 153, 0]),
+ 40:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=40, color=[255, 230, 0]),
+ 41:
+ dict(
+ link=('right_hand_root', 'right_forefinger1'),
+ id=41,
+ color=[204, 255, 0]),
+ 42:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=42,
+ color=[128, 255, 0]),
+ 43:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=43,
+ color=[51, 255, 0]),
+ 44:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=44,
+ color=[0, 255, 26]),
+ 45:
+ dict(
+ link=('right_hand_root', 'right_middle_finger1'),
+ id=45,
+ color=[0, 255, 102]),
+ 46:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=46,
+ color=[0, 255, 178]),
+ 47:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=47,
+            color=[0, 255, 255]),
+ 48:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=48,
+ color=[0, 178, 255]),
+ 49:
+ dict(
+ link=('right_hand_root', 'right_ring_finger1'),
+ id=49,
+ color=[0, 102, 255]),
+ 50:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=50,
+ color=[0, 26, 255]),
+ 51:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=51,
+ color=[51, 0, 255]),
+ 52:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=52,
+ color=[128, 0, 255]),
+ 53:
+ dict(
+ link=('right_hand_root', 'right_pinky_finger1'),
+ id=53,
+ color=[204, 0, 255]),
+ 54:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=54,
+ color=[255, 0, 230]),
+ 55:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=55,
+ color=[255, 0, 153]),
+ 56:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=56,
+ color=[255, 0, 76])
+ },
+ joint_weights=[1.] * 134,
+ # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/'
+ # 'evaluation/myeval_wholebody.py#L175'
+ sigmas=[
+ 0.026, 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
+ 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066,
+ 0.066, 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035,
+ 0.031, 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041,
+ 0.045, 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013,
+ 0.015, 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010,
+ 0.017, 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012,
+ 0.010, 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008,
+ 0.009, 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008,
+ 0.01, 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024,
+ 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02,
+ 0.019, 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025,
+ 0.024, 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032,
+ 0.02, 0.019, 0.022, 0.031
+ ])
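
Every keypoint above carries a `swap` field naming its horizontal mirror (empty for midline points such as `nose` or `neck`); that field is what drives flip augmentation and flipped-prediction averaging at test time. A minimal, dependency-free sketch of the derivation MMPose performs internally — the helper name here is ours, not MMPose API:

```python
def flip_indices(keypoint_info):
    """For each keypoint id, the id of its horizontal-flip partner
    (its own id when swap='')."""
    name2id = {v['name']: k for k, v in keypoint_info.items()}
    return [
        name2id[v['swap']] if v['swap'] else k
        for k, v in sorted(keypoint_info.items())
    ]

# With the wholebody table above, 'right_thumb1' (id 114) maps to the id
# of 'left_thumb1' and vice versa, while 'neck' maps to itself.
```
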
diff --git a/modules/rtmpose/configs/_base_/datasets/cofw.py b/modules/rtmpose/configs/_base_/datasets/cofw.py
new file mode 100644
index 0000000..8879254
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/cofw.py
@@ -0,0 +1,57 @@
+dataset_info = dict(
+ dataset_name='cofw',
+ paper_info=dict(
+ author='Burgos-Artizzu, Xavier P and Perona, '
+ r'Pietro and Doll{\'a}r, Piotr',
+ title='Robust face landmark estimation under occlusion',
+ container='Proceedings of the IEEE international '
+ 'conference on computer vision',
+ year='2013',
+ homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-1'),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-0'),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-6'),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-7'),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-4'),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-5'),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-11'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-10'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'),
+ 13:
+ dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-15'),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-13'),
+ 16:
+ dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-17'),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-16'),
+ 18:
+ dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-19'),
+ 19:
+ dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-18'),
+ 20: dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap=''),
+ 21: dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap=''),
+ 22:
+ dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-23'),
+ 23:
+ dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-22'),
+ 24: dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap=''),
+ 25: dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap=''),
+ 26: dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap=''),
+ 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''),
+ 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='')
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 29,
+ sigmas=[])
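
On their own these `_base_` files just define a `dataset_info` dict; MMPose resolves them through its metainfo parser. A sketch of the usual MMPose 1.x entry point, using the vendored COFW file above (import path and returned key names are as in current MMPose 1.x and may differ in other versions):

```python
from mmpose.datasets.datasets.utils import parse_pose_metainfo

# Same mechanism the dataset classes use via METAINFO = dict(from_file=...)
metainfo = parse_pose_metainfo(
    dict(from_file='modules/rtmpose/configs/_base_/datasets/cofw.py'))

print(metainfo['dataset_name'])      # 'cofw'
print(metainfo['flip_indices'][:4])  # [1, 0, 3, 2], derived from the swap table
```
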
diff --git a/modules/rtmpose/configs/_base_/datasets/crowdpose.py b/modules/rtmpose/configs/_base_/datasets/crowdpose.py
new file mode 100644
index 0000000..358d36f
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/crowdpose.py
@@ -0,0 +1,147 @@
+dataset_info = dict(
+ dataset_name='crowdpose',
+ paper_info=dict(
+ author='Li, Jiefeng and Wang, Can and Zhu, Hao and '
+ 'Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu',
+ title='CrowdPose: Efficient Crowded Scenes Pose Estimation '
+ 'and A New Benchmark',
+ container='Proceedings of IEEE Conference on Computer '
+ 'Vision and Pattern Recognition (CVPR)',
+ year='2019',
+ homepage='https://github.com/Jeff-sjtu/CrowdPose',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left_shoulder',
+ id=0,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_shoulder'),
+ 1:
+ dict(
+ name='right_shoulder',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_shoulder'),
+ 2:
+ dict(
+ name='left_elbow',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_elbow'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='left_wrist',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_wrist'),
+ 5:
+ dict(
+ name='right_wrist',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='left_wrist'),
+ 6:
+ dict(
+ name='left_hip',
+ id=6,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_hip'),
+ 7:
+ dict(
+ name='right_hip',
+ id=7,
+ color=[0, 255, 0],
+ type='lower',
+ swap='left_hip'),
+ 8:
+ dict(
+ name='left_knee',
+ id=8,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_knee'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[0, 255, 0],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='left_ankle',
+ id=10,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_ankle'),
+ 11:
+ dict(
+ name='right_ankle',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='left_ankle'),
+ 12:
+ dict(
+ name='top_head', id=12, color=[255, 128, 0], type='upper',
+ swap=''),
+ 13:
+ dict(name='neck', id=13, color=[0, 255, 0], type='upper', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('top_head', 'neck'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('right_shoulder', 'neck'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('left_shoulder', 'neck'), id=14, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5
+ ],
+ sigmas=[
+ 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087,
+ 0.089, 0.089, 0.079, 0.079
+ ])
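
Tables this size are easy to get subtly wrong: a copied `id`, a one-sided `swap`, a sigma list whose length drifts from the keypoint count. A small consistency check catches all three; `check_dataset_info` below is a hypothetical helper of ours, not part of MMPose:

```python
def check_dataset_info(info):
    kpts = info['keypoint_info']
    by_name = {v['name']: v for v in kpts.values()}
    for key, kpt in kpts.items():
        # dict key and id field must agree
        assert kpt['id'] == key, f"id {kpt['id']} != key {key}"
        if kpt['swap']:
            partner = by_name.get(kpt['swap'])
            assert partner is not None, f"unknown swap target {kpt['swap']!r}"
            # swaps must be mutual: A -> B implies B -> A
            assert partner['swap'] == kpt['name'], f"one-sided swap on {kpt['name']!r}"
    for key, link in info['skeleton_info'].items():
        assert link['id'] == key, f"skeleton id {link['id']} != key {key}"
        assert all(end in by_name for end in link['link'])
    assert len(info['joint_weights']) == len(kpts)
    if info['sigmas']:
        assert len(info['sigmas']) == len(kpts)

# e.g. exec the crowdpose config and pass its dataset_info dict:
# ns = {}; exec(open('modules/rtmpose/configs/_base_/datasets/crowdpose.py').read(), ns)
# check_dataset_info(ns['dataset_info'])
```
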
diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion2.py b/modules/rtmpose/configs/_base_/datasets/deepfashion2.py
new file mode 100644
index 0000000..de7004e
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/deepfashion2.py
@@ -0,0 +1,2660 @@
+colors = dict(
+    sss=[255, 128, 0],  # short_sleeved_shirt
+ lss=[255, 0, 128], # long_sleeved_shirt
+ sso=[128, 0, 255], # short_sleeved_outwear
+ lso=[0, 128, 255], # long_sleeved_outwear
+ vest=[0, 128, 128], # vest
+ sling=[0, 0, 128], # sling
+ shorts=[128, 128, 128], # shorts
+ trousers=[128, 0, 128], # trousers
+ skirt=[64, 128, 128], # skirt
+ ssd=[64, 64, 128], # short_sleeved_dress
+ lsd=[128, 64, 0], # long_sleeved_dress
+ vd=[128, 64, 255], # vest_dress
+ sd=[128, 64, 0], # sling_dress
+)
+dataset_info = dict(
+ dataset_name='deepfashion2',
+ paper_info=dict(
+ author='Yuying Ge and Ruimao Zhang and Lingyun Wu '
+ 'and Xiaogang Wang and Xiaoou Tang and Ping Luo',
+ title='DeepFashion2: A Versatile Benchmark for '
+ 'Detection, Pose Estimation, Segmentation and '
+ 'Re-Identification of Clothing Images',
+ container='Proceedings of IEEE Conference on Computer '
+ 'Vision and Pattern Recognition (CVPR)',
+ year='2019',
+ homepage='https://github.com/switchablenorms/DeepFashion2',
+ ),
+ keypoint_info={
+ # short_sleeved_shirt
+ 0:
+ dict(name='sss_kpt1', id=0, color=colors['sss'], type='', swap=''),
+ 1:
+ dict(
+ name='sss_kpt2',
+ id=1,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt6'),
+ 2:
+ dict(
+ name='sss_kpt3',
+ id=2,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt5'),
+ 3:
+ dict(name='sss_kpt4', id=3, color=colors['sss'], type='', swap=''),
+ 4:
+ dict(
+ name='sss_kpt5',
+ id=4,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt3'),
+ 5:
+ dict(
+ name='sss_kpt6',
+ id=5,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt2'),
+ 6:
+ dict(
+ name='sss_kpt7',
+ id=6,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt25'),
+ 7:
+ dict(
+ name='sss_kpt8',
+ id=7,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt24'),
+ 8:
+ dict(
+ name='sss_kpt9',
+ id=8,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt23'),
+ 9:
+ dict(
+ name='sss_kpt10',
+ id=9,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt22'),
+ 10:
+ dict(
+ name='sss_kpt11',
+ id=10,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt21'),
+ 11:
+ dict(
+ name='sss_kpt12',
+ id=11,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt20'),
+ 12:
+ dict(
+ name='sss_kpt13',
+ id=12,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt19'),
+ 13:
+ dict(
+ name='sss_kpt14',
+ id=13,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt18'),
+ 14:
+ dict(
+ name='sss_kpt15',
+ id=14,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt17'),
+ 15:
+ dict(name='sss_kpt16', id=15, color=colors['sss'], type='', swap=''),
+ 16:
+ dict(
+ name='sss_kpt17',
+ id=16,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt15'),
+ 17:
+ dict(
+ name='sss_kpt18',
+ id=17,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt14'),
+ 18:
+ dict(
+ name='sss_kpt19',
+ id=18,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt13'),
+ 19:
+ dict(
+ name='sss_kpt20',
+ id=19,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt12'),
+ 20:
+ dict(
+ name='sss_kpt21',
+ id=20,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt11'),
+ 21:
+ dict(
+ name='sss_kpt22',
+ id=21,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt10'),
+ 22:
+ dict(
+ name='sss_kpt23',
+ id=22,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt9'),
+ 23:
+ dict(
+ name='sss_kpt24',
+ id=23,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt8'),
+ 24:
+ dict(
+ name='sss_kpt25',
+ id=24,
+ color=colors['sss'],
+ type='',
+ swap='sss_kpt7'),
+ # long_sleeved_shirt
+ 25:
+ dict(name='lss_kpt1', id=25, color=colors['lss'], type='', swap=''),
+ 26:
+ dict(
+ name='lss_kpt2',
+ id=26,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt6'),
+ 27:
+ dict(
+ name='lss_kpt3',
+ id=27,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt5'),
+ 28:
+ dict(name='lss_kpt4', id=28, color=colors['lss'], type='', swap=''),
+ 29:
+ dict(
+ name='lss_kpt5',
+ id=29,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt3'),
+ 30:
+ dict(
+ name='lss_kpt6',
+ id=30,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt2'),
+ 31:
+ dict(
+ name='lss_kpt7',
+ id=31,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt33'),
+ 32:
+ dict(
+ name='lss_kpt8',
+ id=32,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt32'),
+ 33:
+ dict(
+ name='lss_kpt9',
+ id=33,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt31'),
+ 34:
+ dict(
+ name='lss_kpt10',
+ id=34,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt30'),
+ 35:
+ dict(
+ name='lss_kpt11',
+ id=35,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt29'),
+ 36:
+ dict(
+ name='lss_kpt12',
+ id=36,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt28'),
+ 37:
+ dict(
+ name='lss_kpt13',
+ id=37,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt27'),
+ 38:
+ dict(
+ name='lss_kpt14',
+ id=38,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt26'),
+ 39:
+ dict(
+ name='lss_kpt15',
+ id=39,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt25'),
+ 40:
+ dict(
+ name='lss_kpt16',
+ id=40,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt24'),
+ 41:
+ dict(
+ name='lss_kpt17',
+ id=41,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt23'),
+ 42:
+ dict(
+ name='lss_kpt18',
+ id=42,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt22'),
+ 43:
+ dict(
+ name='lss_kpt19',
+ id=43,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt21'),
+ 44:
+ dict(name='lss_kpt20', id=44, color=colors['lss'], type='', swap=''),
+ 45:
+ dict(
+ name='lss_kpt21',
+ id=45,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt19'),
+ 46:
+ dict(
+ name='lss_kpt22',
+ id=46,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt18'),
+ 47:
+ dict(
+ name='lss_kpt23',
+ id=47,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt17'),
+ 48:
+ dict(
+ name='lss_kpt24',
+ id=48,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt16'),
+ 49:
+ dict(
+ name='lss_kpt25',
+ id=49,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt15'),
+ 50:
+ dict(
+ name='lss_kpt26',
+ id=50,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt14'),
+ 51:
+ dict(
+ name='lss_kpt27',
+ id=51,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt13'),
+ 52:
+ dict(
+ name='lss_kpt28',
+ id=52,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt12'),
+ 53:
+ dict(
+ name='lss_kpt29',
+ id=53,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt11'),
+ 54:
+ dict(
+ name='lss_kpt30',
+ id=54,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt10'),
+ 55:
+ dict(
+ name='lss_kpt31',
+ id=55,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt9'),
+ 56:
+ dict(
+ name='lss_kpt32',
+ id=56,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt8'),
+ 57:
+ dict(
+ name='lss_kpt33',
+ id=57,
+ color=colors['lss'],
+ type='',
+ swap='lss_kpt7'),
+ # short_sleeved_outwear
+ 58:
+ dict(name='sso_kpt1', id=58, color=colors['sso'], type='', swap=''),
+ 59:
+ dict(
+ name='sso_kpt2',
+ id=59,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt26'),
+ 60:
+ dict(
+ name='sso_kpt3',
+ id=60,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt5'),
+ 61:
+ dict(
+ name='sso_kpt4',
+ id=61,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt6'),
+ 62:
+ dict(
+ name='sso_kpt5',
+ id=62,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt3'),
+ 63:
+ dict(
+ name='sso_kpt6',
+ id=63,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt4'),
+ 64:
+ dict(
+ name='sso_kpt7',
+ id=64,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt25'),
+ 65:
+ dict(
+ name='sso_kpt8',
+ id=65,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt24'),
+ 66:
+ dict(
+ name='sso_kpt9',
+ id=66,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt23'),
+ 67:
+ dict(
+ name='sso_kpt10',
+ id=67,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt22'),
+ 68:
+ dict(
+ name='sso_kpt11',
+ id=68,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt21'),
+ 69:
+ dict(
+ name='sso_kpt12',
+ id=69,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt20'),
+ 70:
+ dict(
+ name='sso_kpt13',
+ id=70,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt19'),
+ 71:
+ dict(
+ name='sso_kpt14',
+ id=71,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt18'),
+ 72:
+ dict(
+ name='sso_kpt15',
+ id=72,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt17'),
+ 73:
+ dict(
+ name='sso_kpt16',
+ id=73,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt29'),
+ 74:
+ dict(
+ name='sso_kpt17',
+ id=74,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt15'),
+ 75:
+ dict(
+ name='sso_kpt18',
+ id=75,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt14'),
+ 76:
+ dict(
+ name='sso_kpt19',
+ id=76,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt13'),
+ 77:
+ dict(
+ name='sso_kpt20',
+ id=77,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt12'),
+ 78:
+ dict(
+ name='sso_kpt21',
+ id=78,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt11'),
+ 79:
+ dict(
+ name='sso_kpt22',
+ id=79,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt10'),
+ 80:
+ dict(
+ name='sso_kpt23',
+ id=80,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt9'),
+ 81:
+ dict(
+ name='sso_kpt24',
+ id=81,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt8'),
+ 82:
+ dict(
+ name='sso_kpt25',
+ id=82,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt7'),
+ 83:
+ dict(
+ name='sso_kpt26',
+ id=83,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt2'),
+ 84:
+ dict(
+ name='sso_kpt27',
+ id=84,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt30'),
+ 85:
+ dict(
+ name='sso_kpt28',
+ id=85,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt31'),
+ 86:
+ dict(
+ name='sso_kpt29',
+ id=86,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt16'),
+ 87:
+ dict(
+ name='sso_kpt30',
+ id=87,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt27'),
+ 88:
+ dict(
+ name='sso_kpt31',
+ id=88,
+ color=colors['sso'],
+ type='',
+ swap='sso_kpt28'),
+ # long_sleeved_outwear
+ 89:
+ dict(name='lso_kpt1', id=89, color=colors['lso'], type='', swap=''),
+ 90:
+ dict(
+ name='lso_kpt2',
+ id=90,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt6'),
+ 91:
+ dict(
+ name='lso_kpt3',
+ id=91,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt5'),
+ 92:
+ dict(
+ name='lso_kpt4',
+ id=92,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt34'),
+ 93:
+ dict(
+ name='lso_kpt5',
+ id=93,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt3'),
+ 94:
+ dict(
+ name='lso_kpt6',
+ id=94,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt2'),
+ 95:
+ dict(
+ name='lso_kpt7',
+ id=95,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt33'),
+ 96:
+ dict(
+ name='lso_kpt8',
+ id=96,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt32'),
+ 97:
+ dict(
+ name='lso_kpt9',
+ id=97,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt31'),
+ 98:
+ dict(
+ name='lso_kpt10',
+ id=98,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt30'),
+ 99:
+ dict(
+ name='lso_kpt11',
+ id=99,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt29'),
+ 100:
+ dict(
+ name='lso_kpt12',
+ id=100,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt28'),
+ 101:
+ dict(
+ name='lso_kpt13',
+ id=101,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt27'),
+ 102:
+ dict(
+ name='lso_kpt14',
+ id=102,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt26'),
+ 103:
+ dict(
+ name='lso_kpt15',
+ id=103,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt25'),
+ 104:
+ dict(
+ name='lso_kpt16',
+ id=104,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt24'),
+ 105:
+ dict(
+ name='lso_kpt17',
+ id=105,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt23'),
+ 106:
+ dict(
+ name='lso_kpt18',
+ id=106,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt22'),
+ 107:
+ dict(
+ name='lso_kpt19',
+ id=107,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt21'),
+ 108:
+ dict(
+ name='lso_kpt20',
+ id=108,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt37'),
+ 109:
+ dict(
+ name='lso_kpt21',
+ id=109,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt19'),
+ 110:
+ dict(
+ name='lso_kpt22',
+ id=110,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt18'),
+ 111:
+ dict(
+ name='lso_kpt23',
+ id=111,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt17'),
+ 112:
+ dict(
+ name='lso_kpt24',
+ id=112,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt16'),
+ 113:
+ dict(
+ name='lso_kpt25',
+ id=113,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt15'),
+ 114:
+ dict(
+ name='lso_kpt26',
+ id=114,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt14'),
+ 115:
+ dict(
+ name='lso_kpt27',
+ id=115,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt13'),
+ 116:
+ dict(
+ name='lso_kpt28',
+ id=116,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt12'),
+ 117:
+ dict(
+ name='lso_kpt29',
+ id=117,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt11'),
+ 118:
+ dict(
+ name='lso_kpt30',
+ id=118,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt10'),
+ 119:
+ dict(
+ name='lso_kpt31',
+ id=119,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt9'),
+ 120:
+ dict(
+ name='lso_kpt32',
+ id=120,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt8'),
+ 121:
+ dict(
+ name='lso_kpt33',
+ id=121,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt7'),
+ 122:
+ dict(
+ name='lso_kpt34',
+ id=122,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt4'),
+ 123:
+ dict(
+ name='lso_kpt35',
+ id=123,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt38'),
+ 124:
+ dict(
+ name='lso_kpt36',
+ id=124,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt39'),
+ 125:
+ dict(
+ name='lso_kpt37',
+ id=125,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt20'),
+ 126:
+ dict(
+ name='lso_kpt38',
+ id=126,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt35'),
+ 127:
+ dict(
+ name='lso_kpt39',
+ id=127,
+ color=colors['lso'],
+ type='',
+ swap='lso_kpt36'),
+ # vest
+ 128:
+ dict(name='vest_kpt1', id=128, color=colors['vest'], type='', swap=''),
+ 129:
+ dict(
+ name='vest_kpt2',
+ id=129,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt6'),
+ 130:
+ dict(
+ name='vest_kpt3',
+ id=130,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt5'),
+ 131:
+ dict(name='vest_kpt4', id=131, color=colors['vest'], type='', swap=''),
+ 132:
+ dict(
+ name='vest_kpt5',
+ id=132,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt3'),
+ 133:
+ dict(
+ name='vest_kpt6',
+ id=133,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt2'),
+ 134:
+ dict(
+ name='vest_kpt7',
+ id=134,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt15'),
+ 135:
+ dict(
+ name='vest_kpt8',
+ id=135,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt14'),
+ 136:
+ dict(
+ name='vest_kpt9',
+ id=136,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt13'),
+ 137:
+ dict(
+ name='vest_kpt10',
+ id=137,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt12'),
+ 138:
+ dict(
+ name='vest_kpt11', id=138, color=colors['vest'], type='', swap=''),
+ 139:
+ dict(
+ name='vest_kpt12',
+ id=139,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt10'),
+ 140:
+ dict(
+            name='vest_kpt13', id=140, color=colors['vest'], type='', swap='vest_kpt9'),
+ 141:
+ dict(
+ name='vest_kpt14',
+ id=141,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt8'),
+ 142:
+ dict(
+ name='vest_kpt15',
+ id=142,
+ color=colors['vest'],
+ type='',
+ swap='vest_kpt7'),
+ # sling
+ 143:
+ dict(
+ name='sling_kpt1', id=143, color=colors['sling'], type='',
+ swap=''),
+ 144:
+ dict(
+ name='sling_kpt2',
+ id=144,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt6'),
+ 145:
+ dict(
+ name='sling_kpt3',
+ id=145,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt5'),
+ 146:
+ dict(
+ name='sling_kpt4', id=146, color=colors['sling'], type='',
+ swap=''),
+ 147:
+ dict(
+ name='sling_kpt5',
+ id=147,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt3'),
+ 148:
+ dict(
+ name='sling_kpt6',
+ id=148,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt2'),
+ 149:
+ dict(
+ name='sling_kpt7',
+ id=149,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt15'),
+ 150:
+ dict(
+ name='sling_kpt8',
+ id=150,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt14'),
+ 151:
+ dict(
+ name='sling_kpt9',
+ id=151,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt13'),
+ 152:
+ dict(
+ name='sling_kpt10',
+ id=152,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt12'),
+ 153:
+ dict(
+ name='sling_kpt11',
+ id=153,
+ color=colors['sling'],
+ type='',
+ swap=''),
+ 154:
+ dict(
+ name='sling_kpt12',
+ id=154,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt10'),
+ 155:
+ dict(
+ name='sling_kpt13',
+ id=155,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt9'),
+ 156:
+ dict(
+ name='sling_kpt14',
+ id=156,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt8'),
+ 157:
+ dict(
+ name='sling_kpt15',
+ id=157,
+ color=colors['sling'],
+ type='',
+ swap='sling_kpt7'),
+ # shorts
+ 158:
+ dict(
+ name='shorts_kpt1',
+ id=158,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt3'),
+ 159:
+ dict(
+ name='shorts_kpt2',
+ id=159,
+ color=colors['shorts'],
+ type='',
+ swap=''),
+ 160:
+ dict(
+ name='shorts_kpt3',
+ id=160,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt1'),
+ 161:
+ dict(
+ name='shorts_kpt4',
+ id=161,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt10'),
+ 162:
+ dict(
+ name='shorts_kpt5',
+ id=162,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt9'),
+ 163:
+ dict(
+ name='shorts_kpt6',
+ id=163,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt8'),
+ 164:
+ dict(
+ name='shorts_kpt7',
+ id=164,
+ color=colors['shorts'],
+ type='',
+ swap=''),
+ 165:
+ dict(
+ name='shorts_kpt8',
+ id=165,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt6'),
+ 166:
+ dict(
+ name='shorts_kpt9',
+ id=166,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt5'),
+ 167:
+ dict(
+ name='shorts_kpt10',
+ id=167,
+ color=colors['shorts'],
+ type='',
+ swap='shorts_kpt4'),
+ # trousers
+ 168:
+ dict(
+ name='trousers_kpt1',
+ id=168,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt3'),
+ 169:
+ dict(
+ name='trousers_kpt2',
+ id=169,
+ color=colors['trousers'],
+ type='',
+ swap=''),
+ 170:
+ dict(
+ name='trousers_kpt3',
+ id=170,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt1'),
+ 171:
+ dict(
+ name='trousers_kpt4',
+ id=171,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt14'),
+ 172:
+ dict(
+ name='trousers_kpt5',
+ id=172,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt13'),
+ 173:
+ dict(
+ name='trousers_kpt6',
+ id=173,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt12'),
+ 174:
+ dict(
+ name='trousers_kpt7',
+ id=174,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt11'),
+ 175:
+ dict(
+ name='trousers_kpt8',
+ id=175,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt10'),
+ 176:
+ dict(
+ name='trousers_kpt9',
+ id=176,
+ color=colors['trousers'],
+ type='',
+ swap=''),
+ 177:
+ dict(
+ name='trousers_kpt10',
+ id=177,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt8'),
+ 178:
+ dict(
+ name='trousers_kpt11',
+ id=178,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt7'),
+ 179:
+ dict(
+ name='trousers_kpt12',
+ id=179,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt6'),
+ 180:
+ dict(
+ name='trousers_kpt13',
+ id=180,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt5'),
+ 181:
+ dict(
+ name='trousers_kpt14',
+ id=181,
+ color=colors['trousers'],
+ type='',
+ swap='trousers_kpt4'),
+ # skirt
+ 182:
+ dict(
+ name='skirt_kpt1',
+ id=182,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt3'),
+ 183:
+ dict(
+ name='skirt_kpt2', id=183, color=colors['skirt'], type='',
+ swap=''),
+ 184:
+ dict(
+ name='skirt_kpt3',
+ id=184,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt1'),
+ 185:
+ dict(
+ name='skirt_kpt4',
+ id=185,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt8'),
+ 186:
+ dict(
+ name='skirt_kpt5',
+ id=186,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt7'),
+ 187:
+ dict(
+ name='skirt_kpt6', id=187, color=colors['skirt'], type='',
+ swap=''),
+ 188:
+ dict(
+ name='skirt_kpt7',
+ id=188,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt5'),
+ 189:
+ dict(
+ name='skirt_kpt8',
+ id=189,
+ color=colors['skirt'],
+ type='',
+ swap='skirt_kpt4'),
+ # short_sleeved_dress
+ 190:
+ dict(name='ssd_kpt1', id=190, color=colors['ssd'], type='', swap=''),
+ 191:
+ dict(
+ name='ssd_kpt2',
+ id=191,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt6'),
+ 192:
+ dict(
+ name='ssd_kpt3',
+ id=192,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt5'),
+ 193:
+ dict(name='ssd_kpt4', id=193, color=colors['ssd'], type='', swap=''),
+ 194:
+ dict(
+ name='ssd_kpt5',
+ id=194,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt3'),
+ 195:
+ dict(
+ name='ssd_kpt6',
+ id=195,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt2'),
+ 196:
+ dict(
+ name='ssd_kpt7',
+ id=196,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt29'),
+ 197:
+ dict(
+ name='ssd_kpt8',
+ id=197,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt28'),
+ 198:
+ dict(
+ name='ssd_kpt9',
+ id=198,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt27'),
+ 199:
+ dict(
+ name='ssd_kpt10',
+ id=199,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt26'),
+ 200:
+ dict(
+ name='ssd_kpt11',
+ id=200,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt25'),
+ 201:
+ dict(
+ name='ssd_kpt12',
+ id=201,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt24'),
+ 202:
+ dict(
+ name='ssd_kpt13',
+ id=202,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt23'),
+ 203:
+ dict(
+ name='ssd_kpt14',
+ id=203,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt22'),
+ 204:
+ dict(
+ name='ssd_kpt15',
+ id=204,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt21'),
+ 205:
+ dict(
+ name='ssd_kpt16',
+ id=205,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt20'),
+ 206:
+ dict(
+ name='ssd_kpt17',
+ id=206,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt19'),
+ 207:
+ dict(name='ssd_kpt18', id=207, color=colors['ssd'], type='', swap=''),
+ 208:
+ dict(
+ name='ssd_kpt19',
+ id=208,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt17'),
+ 209:
+ dict(
+ name='ssd_kpt20',
+ id=209,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt16'),
+ 210:
+ dict(
+ name='ssd_kpt21',
+ id=210,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt15'),
+ 211:
+ dict(
+ name='ssd_kpt22',
+ id=211,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt14'),
+ 212:
+ dict(
+ name='ssd_kpt23',
+ id=212,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt13'),
+ 213:
+ dict(
+ name='ssd_kpt24',
+ id=213,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt12'),
+ 214:
+ dict(
+ name='ssd_kpt25',
+ id=214,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt11'),
+ 215:
+ dict(
+ name='ssd_kpt26',
+ id=215,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt10'),
+ 216:
+ dict(
+ name='ssd_kpt27',
+ id=216,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt9'),
+ 217:
+ dict(
+ name='ssd_kpt28',
+ id=217,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt8'),
+ 218:
+ dict(
+ name='ssd_kpt29',
+ id=218,
+ color=colors['ssd'],
+ type='',
+ swap='ssd_kpt7'),
+ # long_sleeved_dress
+ 219:
+ dict(name='lsd_kpt1', id=219, color=colors['lsd'], type='', swap=''),
+ 220:
+ dict(
+ name='lsd_kpt2',
+ id=220,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt6'),
+ 221:
+ dict(
+ name='lsd_kpt3',
+ id=221,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt5'),
+ 222:
+ dict(name='lsd_kpt4', id=222, color=colors['lsd'], type='', swap=''),
+ 223:
+ dict(
+ name='lsd_kpt5',
+ id=223,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt3'),
+ 224:
+ dict(
+ name='lsd_kpt6',
+ id=224,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt2'),
+ 225:
+ dict(
+ name='lsd_kpt7',
+ id=225,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt37'),
+ 226:
+ dict(
+ name='lsd_kpt8',
+ id=226,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt36'),
+ 227:
+ dict(
+ name='lsd_kpt9',
+ id=227,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt35'),
+ 228:
+ dict(
+ name='lsd_kpt10',
+ id=228,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt34'),
+ 229:
+ dict(
+ name='lsd_kpt11',
+ id=229,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt33'),
+ 230:
+ dict(
+ name='lsd_kpt12',
+ id=230,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt32'),
+ 231:
+ dict(
+ name='lsd_kpt13',
+ id=231,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt31'),
+ 232:
+ dict(
+ name='lsd_kpt14',
+ id=232,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt30'),
+ 233:
+ dict(
+ name='lsd_kpt15',
+ id=233,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt29'),
+ 234:
+ dict(
+ name='lsd_kpt16',
+ id=234,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt28'),
+ 235:
+ dict(
+ name='lsd_kpt17',
+ id=235,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt27'),
+ 236:
+ dict(
+ name='lsd_kpt18',
+ id=236,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt26'),
+ 237:
+ dict(
+ name='lsd_kpt19',
+ id=237,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt25'),
+ 238:
+ dict(
+ name='lsd_kpt20',
+ id=238,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt24'),
+ 239:
+ dict(
+ name='lsd_kpt21',
+ id=239,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt23'),
+ 240:
+ dict(name='lsd_kpt22', id=240, color=colors['lsd'], type='', swap=''),
+ 241:
+ dict(
+ name='lsd_kpt23',
+ id=241,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt21'),
+ 242:
+ dict(
+ name='lsd_kpt24',
+ id=242,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt20'),
+ 243:
+ dict(
+ name='lsd_kpt25',
+ id=243,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt19'),
+ 244:
+ dict(
+ name='lsd_kpt26',
+ id=244,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt18'),
+ 245:
+ dict(
+ name='lsd_kpt27',
+ id=245,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt17'),
+ 246:
+ dict(
+ name='lsd_kpt28',
+ id=246,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt16'),
+ 247:
+ dict(
+ name='lsd_kpt29',
+ id=247,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt15'),
+ 248:
+ dict(
+ name='lsd_kpt30',
+ id=248,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt14'),
+ 249:
+ dict(
+ name='lsd_kpt31',
+ id=249,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt13'),
+ 250:
+ dict(
+ name='lsd_kpt32',
+ id=250,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt12'),
+ 251:
+ dict(
+ name='lsd_kpt33',
+ id=251,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt11'),
+ 252:
+ dict(
+ name='lsd_kpt34',
+ id=252,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt10'),
+ 253:
+ dict(
+ name='lsd_kpt35',
+ id=253,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt9'),
+ 254:
+ dict(
+ name='lsd_kpt36',
+ id=254,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt8'),
+ 255:
+ dict(
+ name='lsd_kpt37',
+ id=255,
+ color=colors['lsd'],
+ type='',
+ swap='lsd_kpt7'),
+ # vest_dress
+ 256:
+ dict(name='vd_kpt1', id=256, color=colors['vd'], type='', swap=''),
+ 257:
+ dict(
+ name='vd_kpt2',
+ id=257,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt6'),
+ 258:
+ dict(
+ name='vd_kpt3',
+ id=258,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt5'),
+ 259:
+ dict(name='vd_kpt4', id=259, color=colors['vd'], type='', swap=''),
+ 260:
+ dict(
+ name='vd_kpt5',
+ id=260,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt3'),
+ 261:
+ dict(
+ name='vd_kpt6',
+ id=261,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt2'),
+ 262:
+ dict(
+ name='vd_kpt7',
+ id=262,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt19'),
+ 263:
+ dict(
+ name='vd_kpt8',
+ id=263,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt18'),
+ 264:
+ dict(
+ name='vd_kpt9',
+ id=264,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt17'),
+ 265:
+ dict(
+ name='vd_kpt10',
+ id=265,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt16'),
+ 266:
+ dict(
+ name='vd_kpt11',
+ id=266,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt15'),
+ 267:
+ dict(
+ name='vd_kpt12',
+ id=267,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt14'),
+ 268:
+ dict(name='vd_kpt13', id=268, color=colors['vd'], type='', swap=''),
+ 269:
+ dict(
+ name='vd_kpt14',
+ id=269,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt12'),
+ 270:
+ dict(
+ name='vd_kpt15',
+ id=270,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt11'),
+ 271:
+ dict(
+ name='vd_kpt16',
+ id=271,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt10'),
+ 272:
+ dict(
+ name='vd_kpt17',
+ id=272,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt9'),
+ 273:
+ dict(
+ name='vd_kpt18',
+ id=273,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt8'),
+ 274:
+ dict(
+ name='vd_kpt19',
+ id=274,
+ color=colors['vd'],
+ type='',
+ swap='vd_kpt7'),
+ # sling_dress
+ 275:
+ dict(name='sd_kpt1', id=275, color=colors['sd'], type='', swap=''),
+ 276:
+ dict(
+ name='sd_kpt2',
+ id=276,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt6'),
+ 277:
+ dict(
+ name='sd_kpt3',
+ id=277,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt5'),
+ 278:
+ dict(name='sd_kpt4', id=278, color=colors['sd'], type='', swap=''),
+ 279:
+ dict(
+ name='sd_kpt5',
+ id=279,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt3'),
+ 280:
+ dict(
+ name='sd_kpt6',
+ id=280,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt2'),
+ 281:
+ dict(
+ name='sd_kpt7',
+ id=281,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt19'),
+ 282:
+ dict(
+ name='sd_kpt8',
+ id=282,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt18'),
+ 283:
+ dict(
+ name='sd_kpt9',
+ id=283,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt17'),
+ 284:
+ dict(
+ name='sd_kpt10',
+ id=284,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt16'),
+ 285:
+ dict(
+ name='sd_kpt11',
+ id=285,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt15'),
+ 286:
+ dict(
+ name='sd_kpt12',
+ id=286,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt14'),
+ 287:
+ dict(name='sd_kpt13', id=287, color=colors['sd'], type='', swap=''),
+ 288:
+ dict(
+ name='sd_kpt14',
+ id=288,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt12'),
+ 289:
+ dict(
+ name='sd_kpt15',
+ id=289,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt11'),
+ 290:
+ dict(
+ name='sd_kpt16',
+ id=290,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt10'),
+ 291:
+ dict(
+ name='sd_kpt17',
+ id=291,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt9'),
+ 292:
+ dict(
+ name='sd_kpt18',
+ id=292,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt8'),
+ 293:
+ dict(
+ name='sd_kpt19',
+ id=293,
+ color=colors['sd'],
+ type='',
+ swap='sd_kpt7'),
+ },
+ skeleton_info={
+ # short_sleeved_shirt
+ 0:
+ dict(link=('sss_kpt1', 'sss_kpt2'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('sss_kpt2', 'sss_kpt7'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('sss_kpt7', 'sss_kpt8'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('sss_kpt8', 'sss_kpt9'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('sss_kpt9', 'sss_kpt10'), id=4, color=[255, 128, 0]),
+ 5:
+ dict(link=('sss_kpt10', 'sss_kpt11'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(link=('sss_kpt11', 'sss_kpt12'), id=6, color=[255, 128, 0]),
+ 7:
+ dict(link=('sss_kpt12', 'sss_kpt13'), id=7, color=[255, 128, 0]),
+ 8:
+ dict(link=('sss_kpt13', 'sss_kpt14'), id=8, color=[255, 128, 0]),
+ 9:
+ dict(link=('sss_kpt14', 'sss_kpt15'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('sss_kpt15', 'sss_kpt16'), id=10, color=[255, 128, 0]),
+ 11:
+ dict(link=('sss_kpt16', 'sss_kpt17'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('sss_kpt17', 'sss_kpt18'), id=12, color=[255, 128, 0]),
+ 13:
+ dict(link=('sss_kpt18', 'sss_kpt19'), id=13, color=[255, 128, 0]),
+ 14:
+ dict(link=('sss_kpt19', 'sss_kpt20'), id=14, color=[255, 128, 0]),
+ 15:
+ dict(link=('sss_kpt20', 'sss_kpt21'), id=15, color=[255, 128, 0]),
+ 16:
+ dict(link=('sss_kpt21', 'sss_kpt22'), id=16, color=[255, 128, 0]),
+ 17:
+ dict(link=('sss_kpt22', 'sss_kpt23'), id=17, color=[255, 128, 0]),
+ 18:
+ dict(link=('sss_kpt23', 'sss_kpt24'), id=18, color=[255, 128, 0]),
+ 19:
+ dict(link=('sss_kpt24', 'sss_kpt25'), id=19, color=[255, 128, 0]),
+ 20:
+ dict(link=('sss_kpt25', 'sss_kpt6'), id=20, color=[255, 128, 0]),
+ 21:
+ dict(link=('sss_kpt6', 'sss_kpt1'), id=21, color=[255, 128, 0]),
+ 22:
+ dict(link=('sss_kpt2', 'sss_kpt3'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(link=('sss_kpt3', 'sss_kpt4'), id=23, color=[255, 128, 0]),
+ 24:
+ dict(link=('sss_kpt4', 'sss_kpt5'), id=24, color=[255, 128, 0]),
+ 25:
+ dict(link=('sss_kpt5', 'sss_kpt6'), id=25, color=[255, 128, 0]),
+        # long_sleeved_shirt
+ 26:
+ dict(link=('lss_kpt1', 'lss_kpt2'), id=26, color=[255, 0, 128]),
+ 27:
+ dict(link=('lss_kpt2', 'lss_kpt7'), id=27, color=[255, 0, 128]),
+ 28:
+ dict(link=('lss_kpt7', 'lss_kpt8'), id=28, color=[255, 0, 128]),
+ 29:
+ dict(link=('lss_kpt8', 'lss_kpt9'), id=29, color=[255, 0, 128]),
+ 30:
+ dict(link=('lss_kpt9', 'lss_kpt10'), id=30, color=[255, 0, 128]),
+ 31:
+ dict(link=('lss_kpt10', 'lss_kpt11'), id=31, color=[255, 0, 128]),
+ 32:
+ dict(link=('lss_kpt11', 'lss_kpt12'), id=32, color=[255, 0, 128]),
+ 33:
+ dict(link=('lss_kpt12', 'lss_kpt13'), id=33, color=[255, 0, 128]),
+ 34:
+ dict(link=('lss_kpt13', 'lss_kpt14'), id=34, color=[255, 0, 128]),
+ 35:
+ dict(link=('lss_kpt14', 'lss_kpt15'), id=35, color=[255, 0, 128]),
+ 36:
+ dict(link=('lss_kpt15', 'lss_kpt16'), id=36, color=[255, 0, 128]),
+ 37:
+ dict(link=('lss_kpt16', 'lss_kpt17'), id=37, color=[255, 0, 128]),
+ 38:
+ dict(link=('lss_kpt17', 'lss_kpt18'), id=38, color=[255, 0, 128]),
+ 39:
+ dict(link=('lss_kpt18', 'lss_kpt19'), id=39, color=[255, 0, 128]),
+ 40:
+ dict(link=('lss_kpt19', 'lss_kpt20'), id=40, color=[255, 0, 128]),
+ 41:
+ dict(link=('lss_kpt20', 'lss_kpt21'), id=41, color=[255, 0, 128]),
+ 42:
+ dict(link=('lss_kpt21', 'lss_kpt22'), id=42, color=[255, 0, 128]),
+ 43:
+ dict(link=('lss_kpt22', 'lss_kpt23'), id=43, color=[255, 0, 128]),
+ 44:
+ dict(link=('lss_kpt23', 'lss_kpt24'), id=44, color=[255, 0, 128]),
+ 45:
+ dict(link=('lss_kpt24', 'lss_kpt25'), id=45, color=[255, 0, 128]),
+ 46:
+ dict(link=('lss_kpt25', 'lss_kpt26'), id=46, color=[255, 0, 128]),
+ 47:
+ dict(link=('lss_kpt26', 'lss_kpt27'), id=47, color=[255, 0, 128]),
+ 48:
+ dict(link=('lss_kpt27', 'lss_kpt28'), id=48, color=[255, 0, 128]),
+ 49:
+ dict(link=('lss_kpt28', 'lss_kpt29'), id=49, color=[255, 0, 128]),
+ 50:
+ dict(link=('lss_kpt29', 'lss_kpt30'), id=50, color=[255, 0, 128]),
+ 51:
+ dict(link=('lss_kpt30', 'lss_kpt31'), id=51, color=[255, 0, 128]),
+ 52:
+ dict(link=('lss_kpt31', 'lss_kpt32'), id=52, color=[255, 0, 128]),
+ 53:
+ dict(link=('lss_kpt32', 'lss_kpt33'), id=53, color=[255, 0, 128]),
+ 54:
+ dict(link=('lss_kpt33', 'lss_kpt6'), id=54, color=[255, 0, 128]),
+ 55:
+ dict(link=('lss_kpt6', 'lss_kpt5'), id=55, color=[255, 0, 128]),
+ 56:
+ dict(link=('lss_kpt5', 'lss_kpt4'), id=56, color=[255, 0, 128]),
+ 57:
+ dict(link=('lss_kpt4', 'lss_kpt3'), id=57, color=[255, 0, 128]),
+ 58:
+ dict(link=('lss_kpt3', 'lss_kpt2'), id=58, color=[255, 0, 128]),
+ 59:
+ dict(link=('lss_kpt6', 'lss_kpt1'), id=59, color=[255, 0, 128]),
+ # short_sleeved_outwear
+ 60:
+ dict(link=('sso_kpt1', 'sso_kpt4'), id=60, color=[128, 0, 255]),
+ 61:
+ dict(link=('sso_kpt4', 'sso_kpt7'), id=61, color=[128, 0, 255]),
+ 62:
+ dict(link=('sso_kpt7', 'sso_kpt8'), id=62, color=[128, 0, 255]),
+ 63:
+ dict(link=('sso_kpt8', 'sso_kpt9'), id=63, color=[128, 0, 255]),
+ 64:
+ dict(link=('sso_kpt9', 'sso_kpt10'), id=64, color=[128, 0, 255]),
+ 65:
+ dict(link=('sso_kpt10', 'sso_kpt11'), id=65, color=[128, 0, 255]),
+ 66:
+ dict(link=('sso_kpt11', 'sso_kpt12'), id=66, color=[128, 0, 255]),
+ 67:
+ dict(link=('sso_kpt12', 'sso_kpt13'), id=67, color=[128, 0, 255]),
+ 68:
+ dict(link=('sso_kpt13', 'sso_kpt14'), id=68, color=[128, 0, 255]),
+ 69:
+ dict(link=('sso_kpt14', 'sso_kpt15'), id=69, color=[128, 0, 255]),
+ 70:
+ dict(link=('sso_kpt15', 'sso_kpt16'), id=70, color=[128, 0, 255]),
+ 71:
+ dict(link=('sso_kpt16', 'sso_kpt31'), id=71, color=[128, 0, 255]),
+ 72:
+ dict(link=('sso_kpt31', 'sso_kpt30'), id=72, color=[128, 0, 255]),
+ 73:
+ dict(link=('sso_kpt30', 'sso_kpt2'), id=73, color=[128, 0, 255]),
+ 74:
+ dict(link=('sso_kpt2', 'sso_kpt3'), id=74, color=[128, 0, 255]),
+ 75:
+ dict(link=('sso_kpt3', 'sso_kpt4'), id=75, color=[128, 0, 255]),
+ 76:
+ dict(link=('sso_kpt1', 'sso_kpt6'), id=76, color=[128, 0, 255]),
+ 77:
+ dict(link=('sso_kpt6', 'sso_kpt25'), id=77, color=[128, 0, 255]),
+ 78:
+ dict(link=('sso_kpt25', 'sso_kpt24'), id=78, color=[128, 0, 255]),
+ 79:
+ dict(link=('sso_kpt24', 'sso_kpt23'), id=79, color=[128, 0, 255]),
+ 80:
+ dict(link=('sso_kpt23', 'sso_kpt22'), id=80, color=[128, 0, 255]),
+ 81:
+ dict(link=('sso_kpt22', 'sso_kpt21'), id=81, color=[128, 0, 255]),
+ 82:
+ dict(link=('sso_kpt21', 'sso_kpt20'), id=82, color=[128, 0, 255]),
+ 83:
+ dict(link=('sso_kpt20', 'sso_kpt19'), id=83, color=[128, 0, 255]),
+ 84:
+ dict(link=('sso_kpt19', 'sso_kpt18'), id=84, color=[128, 0, 255]),
+ 85:
+ dict(link=('sso_kpt18', 'sso_kpt17'), id=85, color=[128, 0, 255]),
+ 86:
+ dict(link=('sso_kpt17', 'sso_kpt29'), id=86, color=[128, 0, 255]),
+ 87:
+ dict(link=('sso_kpt29', 'sso_kpt28'), id=87, color=[128, 0, 255]),
+ 88:
+ dict(link=('sso_kpt28', 'sso_kpt27'), id=88, color=[128, 0, 255]),
+ 89:
+ dict(link=('sso_kpt27', 'sso_kpt26'), id=89, color=[128, 0, 255]),
+ 90:
+ dict(link=('sso_kpt26', 'sso_kpt5'), id=90, color=[128, 0, 255]),
+ 91:
+ dict(link=('sso_kpt5', 'sso_kpt6'), id=91, color=[128, 0, 255]),
+ # long_sleeved_outwear
+ 92:
+ dict(link=('lso_kpt1', 'lso_kpt2'), id=92, color=[0, 128, 255]),
+ 93:
+ dict(link=('lso_kpt2', 'lso_kpt7'), id=93, color=[0, 128, 255]),
+ 94:
+ dict(link=('lso_kpt7', 'lso_kpt8'), id=94, color=[0, 128, 255]),
+ 95:
+ dict(link=('lso_kpt8', 'lso_kpt9'), id=95, color=[0, 128, 255]),
+ 96:
+ dict(link=('lso_kpt9', 'lso_kpt10'), id=96, color=[0, 128, 255]),
+ 97:
+ dict(link=('lso_kpt10', 'lso_kpt11'), id=97, color=[0, 128, 255]),
+ 98:
+ dict(link=('lso_kpt11', 'lso_kpt12'), id=98, color=[0, 128, 255]),
+ 99:
+ dict(link=('lso_kpt12', 'lso_kpt13'), id=99, color=[0, 128, 255]),
+ 100:
+ dict(link=('lso_kpt13', 'lso_kpt14'), id=100, color=[0, 128, 255]),
+ 101:
+ dict(link=('lso_kpt14', 'lso_kpt15'), id=101, color=[0, 128, 255]),
+ 102:
+ dict(link=('lso_kpt15', 'lso_kpt16'), id=102, color=[0, 128, 255]),
+ 103:
+ dict(link=('lso_kpt16', 'lso_kpt17'), id=103, color=[0, 128, 255]),
+ 104:
+ dict(link=('lso_kpt17', 'lso_kpt18'), id=104, color=[0, 128, 255]),
+ 105:
+ dict(link=('lso_kpt18', 'lso_kpt19'), id=105, color=[0, 128, 255]),
+ 106:
+ dict(link=('lso_kpt19', 'lso_kpt20'), id=106, color=[0, 128, 255]),
+ 107:
+ dict(link=('lso_kpt20', 'lso_kpt39'), id=107, color=[0, 128, 255]),
+ 108:
+ dict(link=('lso_kpt39', 'lso_kpt38'), id=108, color=[0, 128, 255]),
+ 109:
+ dict(link=('lso_kpt38', 'lso_kpt4'), id=109, color=[0, 128, 255]),
+ 110:
+ dict(link=('lso_kpt4', 'lso_kpt3'), id=110, color=[0, 128, 255]),
+ 111:
+ dict(link=('lso_kpt3', 'lso_kpt2'), id=111, color=[0, 128, 255]),
+ 112:
+ dict(link=('lso_kpt1', 'lso_kpt6'), id=112, color=[0, 128, 255]),
+ 113:
+ dict(link=('lso_kpt6', 'lso_kpt33'), id=113, color=[0, 128, 255]),
+ 114:
+ dict(link=('lso_kpt33', 'lso_kpt32'), id=114, color=[0, 128, 255]),
+ 115:
+ dict(link=('lso_kpt32', 'lso_kpt31'), id=115, color=[0, 128, 255]),
+ 116:
+ dict(link=('lso_kpt31', 'lso_kpt30'), id=116, color=[0, 128, 255]),
+ 117:
+ dict(link=('lso_kpt30', 'lso_kpt29'), id=117, color=[0, 128, 255]),
+ 118:
+ dict(link=('lso_kpt29', 'lso_kpt28'), id=118, color=[0, 128, 255]),
+ 119:
+ dict(link=('lso_kpt28', 'lso_kpt27'), id=119, color=[0, 128, 255]),
+ 120:
+ dict(link=('lso_kpt27', 'lso_kpt26'), id=120, color=[0, 128, 255]),
+ 121:
+ dict(link=('lso_kpt26', 'lso_kpt25'), id=121, color=[0, 128, 255]),
+ 122:
+ dict(link=('lso_kpt25', 'lso_kpt24'), id=122, color=[0, 128, 255]),
+ 123:
+ dict(link=('lso_kpt24', 'lso_kpt23'), id=123, color=[0, 128, 255]),
+ 124:
+ dict(link=('lso_kpt23', 'lso_kpt22'), id=124, color=[0, 128, 255]),
+ 125:
+ dict(link=('lso_kpt22', 'lso_kpt21'), id=125, color=[0, 128, 255]),
+ 126:
+ dict(link=('lso_kpt21', 'lso_kpt37'), id=126, color=[0, 128, 255]),
+ 127:
+ dict(link=('lso_kpt37', 'lso_kpt36'), id=127, color=[0, 128, 255]),
+ 128:
+ dict(link=('lso_kpt36', 'lso_kpt35'), id=128, color=[0, 128, 255]),
+ 129:
+ dict(link=('lso_kpt35', 'lso_kpt34'), id=129, color=[0, 128, 255]),
+ 130:
+ dict(link=('lso_kpt34', 'lso_kpt5'), id=130, color=[0, 128, 255]),
+ 131:
+ dict(link=('lso_kpt5', 'lso_kpt6'), id=131, color=[0, 128, 255]),
+ # vest
+ 132:
+ dict(link=('vest_kpt1', 'vest_kpt2'), id=132, color=[0, 128, 128]),
+ 133:
+ dict(link=('vest_kpt2', 'vest_kpt7'), id=133, color=[0, 128, 128]),
+ 134:
+ dict(link=('vest_kpt7', 'vest_kpt8'), id=134, color=[0, 128, 128]),
+ 135:
+ dict(link=('vest_kpt8', 'vest_kpt9'), id=135, color=[0, 128, 128]),
+ 136:
+ dict(link=('vest_kpt9', 'vest_kpt10'), id=136, color=[0, 128, 128]),
+ 137:
+ dict(link=('vest_kpt10', 'vest_kpt11'), id=137, color=[0, 128, 128]),
+ 138:
+ dict(link=('vest_kpt11', 'vest_kpt12'), id=138, color=[0, 128, 128]),
+ 139:
+ dict(link=('vest_kpt12', 'vest_kpt13'), id=139, color=[0, 128, 128]),
+ 140:
+ dict(link=('vest_kpt13', 'vest_kpt14'), id=140, color=[0, 128, 128]),
+ 141:
+ dict(link=('vest_kpt14', 'vest_kpt15'), id=141, color=[0, 128, 128]),
+ 142:
+ dict(link=('vest_kpt15', 'vest_kpt6'), id=142, color=[0, 128, 128]),
+ 143:
+ dict(link=('vest_kpt6', 'vest_kpt1'), id=143, color=[0, 128, 128]),
+ 144:
+ dict(link=('vest_kpt2', 'vest_kpt3'), id=144, color=[0, 128, 128]),
+ 145:
+ dict(link=('vest_kpt3', 'vest_kpt4'), id=145, color=[0, 128, 128]),
+ 146:
+ dict(link=('vest_kpt4', 'vest_kpt5'), id=146, color=[0, 128, 128]),
+ 147:
+ dict(link=('vest_kpt5', 'vest_kpt6'), id=147, color=[0, 128, 128]),
+ # sling
+ 148:
+ dict(link=('sling_kpt1', 'sling_kpt2'), id=148, color=[0, 0, 128]),
+ 149:
+ dict(link=('sling_kpt2', 'sling_kpt8'), id=149, color=[0, 0, 128]),
+ 150:
+ dict(link=('sling_kpt8', 'sling_kpt9'), id=150, color=[0, 0, 128]),
+ 151:
+ dict(link=('sling_kpt9', 'sling_kpt10'), id=151, color=[0, 0, 128]),
+ 152:
+ dict(link=('sling_kpt10', 'sling_kpt11'), id=152, color=[0, 0, 128]),
+ 153:
+ dict(link=('sling_kpt11', 'sling_kpt12'), id=153, color=[0, 0, 128]),
+ 154:
+ dict(link=('sling_kpt12', 'sling_kpt13'), id=154, color=[0, 0, 128]),
+ 155:
+ dict(link=('sling_kpt13', 'sling_kpt14'), id=155, color=[0, 0, 128]),
+ 156:
+ dict(link=('sling_kpt14', 'sling_kpt6'), id=156, color=[0, 0, 128]),
+ 157:
+ dict(link=('sling_kpt2', 'sling_kpt7'), id=157, color=[0, 0, 128]),
+ 158:
+ dict(link=('sling_kpt6', 'sling_kpt15'), id=158, color=[0, 0, 128]),
+ 159:
+ dict(link=('sling_kpt2', 'sling_kpt3'), id=159, color=[0, 0, 128]),
+ 160:
+ dict(link=('sling_kpt3', 'sling_kpt4'), id=160, color=[0, 0, 128]),
+ 161:
+ dict(link=('sling_kpt4', 'sling_kpt5'), id=161, color=[0, 0, 128]),
+ 162:
+ dict(link=('sling_kpt5', 'sling_kpt6'), id=162, color=[0, 0, 128]),
+ 163:
+ dict(link=('sling_kpt1', 'sling_kpt6'), id=163, color=[0, 0, 128]),
+ # shorts
+ 164:
+ dict(
+ link=('shorts_kpt1', 'shorts_kpt4'), id=164, color=[128, 128,
+ 128]),
+ 165:
+ dict(
+ link=('shorts_kpt4', 'shorts_kpt5'), id=165, color=[128, 128,
+ 128]),
+ 166:
+ dict(
+ link=('shorts_kpt5', 'shorts_kpt6'), id=166, color=[128, 128,
+ 128]),
+ 167:
+ dict(
+ link=('shorts_kpt6', 'shorts_kpt7'), id=167, color=[128, 128,
+ 128]),
+ 168:
+ dict(
+ link=('shorts_kpt7', 'shorts_kpt8'), id=168, color=[128, 128,
+ 128]),
+ 169:
+ dict(
+ link=('shorts_kpt8', 'shorts_kpt9'), id=169, color=[128, 128,
+ 128]),
+ 170:
+ dict(
+ link=('shorts_kpt9', 'shorts_kpt10'),
+ id=170,
+ color=[128, 128, 128]),
+ 171:
+ dict(
+ link=('shorts_kpt10', 'shorts_kpt3'),
+ id=171,
+ color=[128, 128, 128]),
+ 172:
+ dict(
+ link=('shorts_kpt3', 'shorts_kpt2'), id=172, color=[128, 128,
+ 128]),
+ 173:
+ dict(
+ link=('shorts_kpt2', 'shorts_kpt1'), id=173, color=[128, 128,
+ 128]),
+ # trousers
+ 174:
+ dict(
+ link=('trousers_kpt1', 'trousers_kpt4'),
+ id=174,
+ color=[128, 0, 128]),
+ 175:
+ dict(
+ link=('trousers_kpt4', 'trousers_kpt5'),
+ id=175,
+ color=[128, 0, 128]),
+ 176:
+ dict(
+ link=('trousers_kpt5', 'trousers_kpt6'),
+ id=176,
+ color=[128, 0, 128]),
+ 177:
+ dict(
+ link=('trousers_kpt6', 'trousers_kpt7'),
+ id=177,
+ color=[128, 0, 128]),
+ 178:
+ dict(
+ link=('trousers_kpt7', 'trousers_kpt8'),
+ id=178,
+ color=[128, 0, 128]),
+ 179:
+ dict(
+ link=('trousers_kpt8', 'trousers_kpt9'),
+ id=179,
+ color=[128, 0, 128]),
+ 180:
+ dict(
+ link=('trousers_kpt9', 'trousers_kpt10'),
+ id=180,
+ color=[128, 0, 128]),
+ 181:
+ dict(
+ link=('trousers_kpt10', 'trousers_kpt11'),
+ id=181,
+ color=[128, 0, 128]),
+ 182:
+ dict(
+ link=('trousers_kpt11', 'trousers_kpt12'),
+ id=182,
+ color=[128, 0, 128]),
+ 183:
+ dict(
+ link=('trousers_kpt12', 'trousers_kpt13'),
+ id=183,
+ color=[128, 0, 128]),
+ 184:
+ dict(
+ link=('trousers_kpt13', 'trousers_kpt14'),
+ id=184,
+ color=[128, 0, 128]),
+ 185:
+ dict(
+ link=('trousers_kpt14', 'trousers_kpt3'),
+ id=185,
+ color=[128, 0, 128]),
+ 186:
+ dict(
+ link=('trousers_kpt3', 'trousers_kpt2'),
+ id=186,
+ color=[128, 0, 128]),
+ 187:
+ dict(
+ link=('trousers_kpt2', 'trousers_kpt1'),
+ id=187,
+ color=[128, 0, 128]),
+ # skirt
+ 188:
+ dict(link=('skirt_kpt1', 'skirt_kpt4'), id=188, color=[64, 128, 128]),
+ 189:
+ dict(link=('skirt_kpt4', 'skirt_kpt5'), id=189, color=[64, 128, 128]),
+ 190:
+ dict(link=('skirt_kpt5', 'skirt_kpt6'), id=190, color=[64, 128, 128]),
+ 191:
+ dict(link=('skirt_kpt6', 'skirt_kpt7'), id=191, color=[64, 128, 128]),
+ 192:
+ dict(link=('skirt_kpt7', 'skirt_kpt8'), id=192, color=[64, 128, 128]),
+ 193:
+ dict(link=('skirt_kpt8', 'skirt_kpt3'), id=193, color=[64, 128, 128]),
+ 194:
+ dict(link=('skirt_kpt3', 'skirt_kpt2'), id=194, color=[64, 128, 128]),
+ 195:
+ dict(link=('skirt_kpt2', 'skirt_kpt1'), id=195, color=[64, 128, 128]),
+ # short_sleeved_dress
+ 196:
+ dict(link=('ssd_kpt1', 'ssd_kpt2'), id=196, color=[64, 64, 128]),
+ 197:
+ dict(link=('ssd_kpt2', 'ssd_kpt7'), id=197, color=[64, 64, 128]),
+ 198:
+ dict(link=('ssd_kpt7', 'ssd_kpt8'), id=198, color=[64, 64, 128]),
+ 199:
+ dict(link=('ssd_kpt8', 'ssd_kpt9'), id=199, color=[64, 64, 128]),
+ 200:
+ dict(link=('ssd_kpt9', 'ssd_kpt10'), id=200, color=[64, 64, 128]),
+ 201:
+ dict(link=('ssd_kpt10', 'ssd_kpt11'), id=201, color=[64, 64, 128]),
+ 202:
+ dict(link=('ssd_kpt11', 'ssd_kpt12'), id=202, color=[64, 64, 128]),
+ 203:
+ dict(link=('ssd_kpt12', 'ssd_kpt13'), id=203, color=[64, 64, 128]),
+ 204:
+ dict(link=('ssd_kpt13', 'ssd_kpt14'), id=204, color=[64, 64, 128]),
+ 205:
+ dict(link=('ssd_kpt14', 'ssd_kpt15'), id=205, color=[64, 64, 128]),
+ 206:
+ dict(link=('ssd_kpt15', 'ssd_kpt16'), id=206, color=[64, 64, 128]),
+ 207:
+ dict(link=('ssd_kpt16', 'ssd_kpt17'), id=207, color=[64, 64, 128]),
+ 208:
+ dict(link=('ssd_kpt17', 'ssd_kpt18'), id=208, color=[64, 64, 128]),
+ 209:
+ dict(link=('ssd_kpt18', 'ssd_kpt19'), id=209, color=[64, 64, 128]),
+ 210:
+ dict(link=('ssd_kpt19', 'ssd_kpt20'), id=210, color=[64, 64, 128]),
+ 211:
+ dict(link=('ssd_kpt20', 'ssd_kpt21'), id=211, color=[64, 64, 128]),
+ 212:
+ dict(link=('ssd_kpt21', 'ssd_kpt22'), id=212, color=[64, 64, 128]),
+ 213:
+ dict(link=('ssd_kpt22', 'ssd_kpt23'), id=213, color=[64, 64, 128]),
+ 214:
+ dict(link=('ssd_kpt23', 'ssd_kpt24'), id=214, color=[64, 64, 128]),
+ 215:
+ dict(link=('ssd_kpt24', 'ssd_kpt25'), id=215, color=[64, 64, 128]),
+ 216:
+ dict(link=('ssd_kpt25', 'ssd_kpt26'), id=216, color=[64, 64, 128]),
+ 217:
+ dict(link=('ssd_kpt26', 'ssd_kpt27'), id=217, color=[64, 64, 128]),
+ 218:
+ dict(link=('ssd_kpt27', 'ssd_kpt28'), id=218, color=[64, 64, 128]),
+ 219:
+ dict(link=('ssd_kpt28', 'ssd_kpt29'), id=219, color=[64, 64, 128]),
+ 220:
+ dict(link=('ssd_kpt29', 'ssd_kpt6'), id=220, color=[64, 64, 128]),
+ 221:
+ dict(link=('ssd_kpt6', 'ssd_kpt5'), id=221, color=[64, 64, 128]),
+ 222:
+ dict(link=('ssd_kpt5', 'ssd_kpt4'), id=222, color=[64, 64, 128]),
+ 223:
+ dict(link=('ssd_kpt4', 'ssd_kpt3'), id=223, color=[64, 64, 128]),
+ 224:
+ dict(link=('ssd_kpt3', 'ssd_kpt2'), id=224, color=[64, 64, 128]),
+ 225:
+ dict(link=('ssd_kpt6', 'ssd_kpt1'), id=225, color=[64, 64, 128]),
+ # long_sleeved_dress
+ 226:
+ dict(link=('lsd_kpt1', 'lsd_kpt2'), id=226, color=[128, 64, 0]),
+ 227:
+ dict(link=('lsd_kpt2', 'lsd_kpt7'), id=227, color=[128, 64, 0]),
+ 228:
+ dict(link=('lsd_kpt7', 'lsd_kpt8'), id=228, color=[128, 64, 0]),
+ 229:
+ dict(link=('lsd_kpt8', 'lsd_kpt9'), id=229, color=[128, 64, 0]),
+ 230:
+ dict(link=('lsd_kpt9', 'lsd_kpt10'), id=230, color=[128, 64, 0]),
+ 231:
+ dict(link=('lsd_kpt10', 'lsd_kpt11'), id=231, color=[128, 64, 0]),
+ 232:
+ dict(link=('lsd_kpt11', 'lsd_kpt12'), id=232, color=[128, 64, 0]),
+ 233:
+ dict(link=('lsd_kpt12', 'lsd_kpt13'), id=233, color=[128, 64, 0]),
+ 234:
+ dict(link=('lsd_kpt13', 'lsd_kpt14'), id=234, color=[128, 64, 0]),
+ 235:
+ dict(link=('lsd_kpt14', 'lsd_kpt15'), id=235, color=[128, 64, 0]),
+ 236:
+ dict(link=('lsd_kpt15', 'lsd_kpt16'), id=236, color=[128, 64, 0]),
+ 237:
+ dict(link=('lsd_kpt16', 'lsd_kpt17'), id=237, color=[128, 64, 0]),
+ 238:
+ dict(link=('lsd_kpt17', 'lsd_kpt18'), id=238, color=[128, 64, 0]),
+ 239:
+ dict(link=('lsd_kpt18', 'lsd_kpt19'), id=239, color=[128, 64, 0]),
+ 240:
+ dict(link=('lsd_kpt19', 'lsd_kpt20'), id=240, color=[128, 64, 0]),
+ 241:
+ dict(link=('lsd_kpt20', 'lsd_kpt21'), id=241, color=[128, 64, 0]),
+ 242:
+ dict(link=('lsd_kpt21', 'lsd_kpt22'), id=242, color=[128, 64, 0]),
+ 243:
+ dict(link=('lsd_kpt22', 'lsd_kpt23'), id=243, color=[128, 64, 0]),
+ 244:
+ dict(link=('lsd_kpt23', 'lsd_kpt24'), id=244, color=[128, 64, 0]),
+ 245:
+ dict(link=('lsd_kpt24', 'lsd_kpt25'), id=245, color=[128, 64, 0]),
+ 246:
+ dict(link=('lsd_kpt25', 'lsd_kpt26'), id=246, color=[128, 64, 0]),
+ 247:
+ dict(link=('lsd_kpt26', 'lsd_kpt27'), id=247, color=[128, 64, 0]),
+ 248:
+ dict(link=('lsd_kpt27', 'lsd_kpt28'), id=248, color=[128, 64, 0]),
+ 249:
+ dict(link=('lsd_kpt28', 'lsd_kpt29'), id=249, color=[128, 64, 0]),
+ 250:
+ dict(link=('lsd_kpt29', 'lsd_kpt30'), id=250, color=[128, 64, 0]),
+ 251:
+ dict(link=('lsd_kpt30', 'lsd_kpt31'), id=251, color=[128, 64, 0]),
+ 252:
+ dict(link=('lsd_kpt31', 'lsd_kpt32'), id=252, color=[128, 64, 0]),
+ 253:
+ dict(link=('lsd_kpt32', 'lsd_kpt33'), id=253, color=[128, 64, 0]),
+ 254:
+ dict(link=('lsd_kpt33', 'lsd_kpt34'), id=254, color=[128, 64, 0]),
+ 255:
+ dict(link=('lsd_kpt34', 'lsd_kpt35'), id=255, color=[128, 64, 0]),
+ 256:
+ dict(link=('lsd_kpt35', 'lsd_kpt36'), id=256, color=[128, 64, 0]),
+ 257:
+ dict(link=('lsd_kpt36', 'lsd_kpt37'), id=257, color=[128, 64, 0]),
+ 258:
+ dict(link=('lsd_kpt37', 'lsd_kpt6'), id=258, color=[128, 64, 0]),
+ 259:
+ dict(link=('lsd_kpt6', 'lsd_kpt5'), id=259, color=[128, 64, 0]),
+ 260:
+ dict(link=('lsd_kpt5', 'lsd_kpt4'), id=260, color=[128, 64, 0]),
+ 261:
+ dict(link=('lsd_kpt4', 'lsd_kpt3'), id=261, color=[128, 64, 0]),
+ 262:
+ dict(link=('lsd_kpt3', 'lsd_kpt2'), id=262, color=[128, 64, 0]),
+ 263:
+ dict(link=('lsd_kpt6', 'lsd_kpt1'), id=263, color=[128, 64, 0]),
+ # vest_dress
+ 264:
+ dict(link=('vd_kpt1', 'vd_kpt2'), id=264, color=[128, 64, 255]),
+ 265:
+ dict(link=('vd_kpt2', 'vd_kpt7'), id=265, color=[128, 64, 255]),
+ 266:
+ dict(link=('vd_kpt7', 'vd_kpt8'), id=266, color=[128, 64, 255]),
+ 267:
+ dict(link=('vd_kpt8', 'vd_kpt9'), id=267, color=[128, 64, 255]),
+ 268:
+ dict(link=('vd_kpt9', 'vd_kpt10'), id=268, color=[128, 64, 255]),
+ 269:
+ dict(link=('vd_kpt10', 'vd_kpt11'), id=269, color=[128, 64, 255]),
+ 270:
+ dict(link=('vd_kpt11', 'vd_kpt12'), id=270, color=[128, 64, 255]),
+ 271:
+ dict(link=('vd_kpt12', 'vd_kpt13'), id=271, color=[128, 64, 255]),
+ 272:
+ dict(link=('vd_kpt13', 'vd_kpt14'), id=272, color=[128, 64, 255]),
+ 273:
+ dict(link=('vd_kpt14', 'vd_kpt15'), id=273, color=[128, 64, 255]),
+ 274:
+ dict(link=('vd_kpt15', 'vd_kpt16'), id=274, color=[128, 64, 255]),
+ 275:
+ dict(link=('vd_kpt16', 'vd_kpt17'), id=275, color=[128, 64, 255]),
+ 276:
+ dict(link=('vd_kpt17', 'vd_kpt18'), id=276, color=[128, 64, 255]),
+ 277:
+ dict(link=('vd_kpt18', 'vd_kpt19'), id=277, color=[128, 64, 255]),
+ 278:
+ dict(link=('vd_kpt19', 'vd_kpt6'), id=278, color=[128, 64, 255]),
+ 279:
+ dict(link=('vd_kpt6', 'vd_kpt5'), id=279, color=[128, 64, 255]),
+ 280:
+ dict(link=('vd_kpt5', 'vd_kpt4'), id=280, color=[128, 64, 255]),
+ 281:
+ dict(link=('vd_kpt4', 'vd_kpt3'), id=281, color=[128, 64, 255]),
+ 282:
+ dict(link=('vd_kpt3', 'vd_kpt2'), id=282, color=[128, 64, 255]),
+ 283:
+ dict(link=('vd_kpt6', 'vd_kpt1'), id=283, color=[128, 64, 255]),
+ # sling_dress
+ 284:
+ dict(link=('sd_kpt1', 'sd_kpt2'), id=284, color=[128, 64, 0]),
+ 285:
+ dict(link=('sd_kpt2', 'sd_kpt8'), id=285, color=[128, 64, 0]),
+ 286:
+ dict(link=('sd_kpt8', 'sd_kpt9'), id=286, color=[128, 64, 0]),
+ 287:
+ dict(link=('sd_kpt9', 'sd_kpt10'), id=287, color=[128, 64, 0]),
+ 288:
+ dict(link=('sd_kpt10', 'sd_kpt11'), id=288, color=[128, 64, 0]),
+ 289:
+ dict(link=('sd_kpt11', 'sd_kpt12'), id=289, color=[128, 64, 0]),
+ 290:
+ dict(link=('sd_kpt12', 'sd_kpt13'), id=290, color=[128, 64, 0]),
+ 291:
+ dict(link=('sd_kpt13', 'sd_kpt14'), id=291, color=[128, 64, 0]),
+ 292:
+ dict(link=('sd_kpt14', 'sd_kpt15'), id=292, color=[128, 64, 0]),
+ 293:
+ dict(link=('sd_kpt15', 'sd_kpt16'), id=293, color=[128, 64, 0]),
+ 294:
+ dict(link=('sd_kpt16', 'sd_kpt17'), id=294, color=[128, 64, 0]),
+ 295:
+ dict(link=('sd_kpt17', 'sd_kpt18'), id=295, color=[128, 64, 0]),
+ 296:
+ dict(link=('sd_kpt18', 'sd_kpt6'), id=296, color=[128, 64, 0]),
+ 297:
+ dict(link=('sd_kpt6', 'sd_kpt5'), id=297, color=[128, 64, 0]),
+ 298:
+ dict(link=('sd_kpt5', 'sd_kpt4'), id=298, color=[128, 64, 0]),
+ 299:
+ dict(link=('sd_kpt4', 'sd_kpt3'), id=299, color=[128, 64, 0]),
+ 300:
+ dict(link=('sd_kpt3', 'sd_kpt2'), id=300, color=[128, 64, 0]),
+ 301:
+ dict(link=('sd_kpt2', 'sd_kpt7'), id=301, color=[128, 64, 0]),
+ 302:
+ dict(link=('sd_kpt6', 'sd_kpt19'), id=302, color=[128, 64, 0]),
+ 303:
+ dict(link=('sd_kpt6', 'sd_kpt1'), id=303, color=[128, 64, 0]),
+ },
+ joint_weights=[1.] * 294,
+ sigmas=[])
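
Each entry's `id` in these metainfo dicts is expected to mirror its dict key, and hand-edited configs of this size drift easily. A minimal consistency-check sketch, assuming the file above sits at the path implied by the sibling configs below:

```python
# Sanity-check sketch (not part of the patch): every keypoint_info /
# skeleton_info entry's `id` should equal its dict key. The config
# path is an assumption based on the sibling files in this directory.
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/_base_/datasets/deepfashion2.py')
info = cfg.dataset_info
for section in ('keypoint_info', 'skeleton_info'):
    for key, entry in info[section].items():
        assert entry['id'] == key, (section, key, entry['id'])
```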
diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py
new file mode 100644
index 0000000..9769127
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_full.py
@@ -0,0 +1,74 @@
+dataset_info = dict(
+ dataset_name='deepfashion_full',
+ paper_info=dict(
+ author='Liu, Ziwei and Luo, Ping and Qiu, Shi '
+ 'and Wang, Xiaogang and Tang, Xiaoou',
+ title='DeepFashion: Powering Robust Clothes Recognition '
+ 'and Retrieval with Rich Annotations',
+ container='Proceedings of IEEE Conference on Computer '
+ 'Vision and Pattern Recognition (CVPR)',
+ year='2016',
+ homepage='http://mmlab.ie.cuhk.edu.hk/projects/'
+ 'DeepFashion/LandmarkDetection.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left collar',
+ id=0,
+ color=[255, 255, 255],
+ type='',
+ swap='right collar'),
+ 1:
+ dict(
+ name='right collar',
+ id=1,
+ color=[255, 255, 255],
+ type='',
+ swap='left collar'),
+ 2:
+ dict(
+ name='left sleeve',
+ id=2,
+ color=[255, 255, 255],
+ type='',
+ swap='right sleeve'),
+ 3:
+ dict(
+ name='right sleeve',
+ id=3,
+ color=[255, 255, 255],
+ type='',
+ swap='left sleeve'),
+ 4:
+ dict(
+ name='left waistline',
+ id=4,
+ color=[255, 255, 255],
+ type='',
+ swap='right waistline'),
+ 5:
+ dict(
+ name='right waistline',
+ id=5,
+ color=[255, 255, 255],
+ type='',
+ swap='left waistline'),
+ 6:
+ dict(
+ name='left hem',
+ id=6,
+ color=[255, 255, 255],
+ type='',
+ swap='right hem'),
+ 7:
+ dict(
+ name='right hem',
+ id=7,
+ color=[255, 255, 255],
+ type='',
+ swap='left hem'),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 8,
+ sigmas=[])
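
The per-keypoint `swap` fields are what horizontal-flip augmentation consumes: each non-empty `swap` names the mirror partner, and the flip pairs follow mechanically. A hedged sketch of deriving them for this 8-keypoint config (MMPose's own metainfo parsing does the equivalent internally):

```python
# Sketch: derive flip pairs from the `swap` fields of the config above.
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/_base_/datasets/deepfashion_full.py')
kpt_info = cfg.dataset_info['keypoint_info']
name_to_id = {k['name']: i for i, k in kpt_info.items()}
flip_pairs = {
    tuple(sorted((i, name_to_id[k['swap']])))
    for i, k in kpt_info.items() if k['swap']}
print(sorted(flip_pairs))  # [(0, 1), (2, 3), (4, 5), (6, 7)]
```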
diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py
new file mode 100644
index 0000000..65995e1
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_lower.py
@@ -0,0 +1,46 @@
+dataset_info = dict(
+ dataset_name='deepfashion_lower',
+ paper_info=dict(
+ author='Liu, Ziwei and Luo, Ping and Qiu, Shi '
+ 'and Wang, Xiaogang and Tang, Xiaoou',
+ title='DeepFashion: Powering Robust Clothes Recognition '
+ 'and Retrieval with Rich Annotations',
+ container='Proceedings of IEEE Conference on Computer '
+ 'Vision and Pattern Recognition (CVPR)',
+ year='2016',
+ homepage='http://mmlab.ie.cuhk.edu.hk/projects/'
+ 'DeepFashion/LandmarkDetection.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left waistline',
+ id=0,
+ color=[255, 255, 255],
+ type='',
+ swap='right waistline'),
+ 1:
+ dict(
+ name='right waistline',
+ id=1,
+ color=[255, 255, 255],
+ type='',
+ swap='left waistline'),
+ 2:
+ dict(
+ name='left hem',
+ id=2,
+ color=[255, 255, 255],
+ type='',
+ swap='right hem'),
+ 3:
+ dict(
+ name='right hem',
+ id=3,
+ color=[255, 255, 255],
+ type='',
+ swap='left hem'),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 4,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py b/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py
new file mode 100644
index 0000000..4f34e2a
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/deepfashion_upper.py
@@ -0,0 +1,60 @@
+dataset_info = dict(
+ dataset_name='deepfashion_upper',
+ paper_info=dict(
+ author='Liu, Ziwei and Luo, Ping and Qiu, Shi '
+ 'and Wang, Xiaogang and Tang, Xiaoou',
+ title='DeepFashion: Powering Robust Clothes Recognition '
+ 'and Retrieval with Rich Annotations',
+ container='Proceedings of IEEE Conference on Computer '
+ 'Vision and Pattern Recognition (CVPR)',
+ year='2016',
+ homepage='http://mmlab.ie.cuhk.edu.hk/projects/'
+ 'DeepFashion/LandmarkDetection.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left collar',
+ id=0,
+ color=[255, 255, 255],
+ type='',
+ swap='right collar'),
+ 1:
+ dict(
+ name='right collar',
+ id=1,
+ color=[255, 255, 255],
+ type='',
+ swap='left collar'),
+ 2:
+ dict(
+ name='left sleeve',
+ id=2,
+ color=[255, 255, 255],
+ type='',
+ swap='right sleeve'),
+ 3:
+ dict(
+ name='right sleeve',
+ id=3,
+ color=[255, 255, 255],
+ type='',
+ swap='left sleeve'),
+ 4:
+ dict(
+ name='left hem',
+ id=4,
+ color=[255, 255, 255],
+ type='',
+ swap='right hem'),
+ 5:
+ dict(
+ name='right hem',
+ id=5,
+ color=[255, 255, 255],
+ type='',
+ swap='left hem'),
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 6,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/exlpose.py b/modules/rtmpose/configs/_base_/datasets/exlpose.py
new file mode 100644
index 0000000..0a27a74
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/exlpose.py
@@ -0,0 +1,125 @@
+dataset_info = dict(
+ dataset_name='exlpose',
+ paper_info=dict(
+ author='Sohyun Lee, Jaesung Rim, Boseung Jeong, Geonu Kim,'
+ 'ByungJu Woo, Haechan Lee, Sunghyun Cho, Suha Kwak',
+ title='Human Pose Estimation in Extremely Low-Light Conditions',
+ container='arXiv',
+ year='2023',
+ homepage='https://arxiv.org/abs/2303.15410',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left_shoulder',
+ id=0,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 1:
+ dict(
+ name='right_shoulder',
+ id=1,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 2:
+ dict(
+ name='left_elbow',
+ id=2,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='left_wrist',
+ id=4,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 5:
+ dict(
+ name='right_wrist',
+ id=5,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 6:
+ dict(
+ name='left_hip',
+ id=6,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 7:
+ dict(
+ name='right_hip',
+ id=7,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 8:
+ dict(
+ name='left_knee',
+ id=8,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='left_ankle',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 11:
+ dict(
+ name='right_ankle',
+ id=11,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 12:
+ dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''),
+ 13:
+ dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
+ },
+ skeleton_info={
+ 0: dict(link=('head', 'neck'), id=0, color=[51, 153, 255]),
+ 1: dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]),
+ 2: dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]),
+ 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('left_elbow', 'left_wrist'), id=4, color=[0, 255, 0]),
+ 5: dict(
+ link=('right_shoulder', 'right_elbow'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]),
+ 7: dict(link=('neck', 'right_hip'), id=7, color=[51, 153, 255]),
+ 8: dict(link=('neck', 'left_hip'), id=8, color=[51, 153, 255]),
+ 9: dict(link=('right_hip', 'right_knee'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('right_knee', 'right_ankle'), id=10, color=[255, 128, 0]),
+ 11: dict(link=('left_hip', 'left_knee'), id=11, color=[0, 255, 0]),
+ 12: dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]),
+ },
+ joint_weights=[
+ 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5
+ ],
+ sigmas=[
+ 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087,
+ 0.089, 0.089, 0.079, 0.079
+ ])
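
The `sigmas` list sets the per-keypoint tolerance for COCO-style OKS evaluation: the smaller the sigma, the more sharply localization error is penalized for that joint. A worked numpy sketch with the ExLPose sigmas (the inputs are illustrative placeholders, not the evaluator itself):

```python
# OKS sketch: exp(-d^2 / (2 * area * (2*sigma)^2)), averaged over
# visible keypoints, following the COCO formulation.
import numpy as np

sigmas = np.array([
    0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087,
    0.087, 0.089, 0.089, 0.079, 0.079])

def oks(gt, pred, area, visible):
    d2 = np.sum((gt - pred) ** 2, axis=-1)       # squared distances
    e = d2 / (2.0 * area * (2.0 * sigmas) ** 2 + np.spacing(1))
    return float(np.mean(np.exp(-e)[visible > 0]))
```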
diff --git a/modules/rtmpose/configs/_base_/datasets/fly.py b/modules/rtmpose/configs/_base_/datasets/fly.py
new file mode 100644
index 0000000..46386b6
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/fly.py
@@ -0,0 +1,237 @@
+dataset_info = dict(
+ dataset_name='fly',
+ paper_info=dict(
+ author='Pereira, Talmo D and Aldarondo, Diego E and '
+ 'Willmore, Lindsay and Kislin, Mikhail and '
+ 'Wang, Samuel S-H and Murthy, Mala and Shaevitz, Joshua W',
+ title='Fast animal pose estimation using deep neural networks',
+ container='Nature methods',
+ year='2019',
+ homepage='https://github.com/jgraving/DeepPoseKit-Data',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='head', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='eyeL', id=1, color=[255, 255, 255], type='', swap='eyeR'),
+ 2:
+ dict(name='eyeR', id=2, color=[255, 255, 255], type='', swap='eyeL'),
+ 3:
+ dict(name='neck', id=3, color=[255, 255, 255], type='', swap=''),
+ 4:
+ dict(name='thorax', id=4, color=[255, 255, 255], type='', swap=''),
+ 5:
+ dict(name='abdomen', id=5, color=[255, 255, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forelegR1',
+ id=6,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL1'),
+ 7:
+ dict(
+ name='forelegR2',
+ id=7,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL2'),
+ 8:
+ dict(
+ name='forelegR3',
+ id=8,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL3'),
+ 9:
+ dict(
+ name='forelegR4',
+ id=9,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL4'),
+ 10:
+ dict(
+ name='midlegR1',
+ id=10,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL1'),
+ 11:
+ dict(
+ name='midlegR2',
+ id=11,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL2'),
+ 12:
+ dict(
+ name='midlegR3',
+ id=12,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL3'),
+ 13:
+ dict(
+ name='midlegR4',
+ id=13,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL4'),
+ 14:
+ dict(
+ name='hindlegR1',
+ id=14,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL1'),
+ 15:
+ dict(
+ name='hindlegR2',
+ id=15,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL2'),
+ 16:
+ dict(
+ name='hindlegR3',
+ id=16,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL3'),
+ 17:
+ dict(
+ name='hindlegR4',
+ id=17,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL4'),
+ 18:
+ dict(
+ name='forelegL1',
+ id=18,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR1'),
+ 19:
+ dict(
+ name='forelegL2',
+ id=19,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR2'),
+ 20:
+ dict(
+ name='forelegL3',
+ id=20,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR3'),
+ 21:
+ dict(
+ name='forelegL4',
+ id=21,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR4'),
+ 22:
+ dict(
+ name='midlegL1',
+ id=22,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR1'),
+ 23:
+ dict(
+ name='midlegL2',
+ id=23,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR2'),
+ 24:
+ dict(
+ name='midlegL3',
+ id=24,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR3'),
+ 25:
+ dict(
+ name='midlegL4',
+ id=25,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR4'),
+ 26:
+ dict(
+ name='hindlegL1',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR1'),
+ 27:
+ dict(
+ name='hindlegL2',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR2'),
+ 28:
+ dict(
+ name='hindlegL3',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR3'),
+ 29:
+ dict(
+ name='hindlegL4',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR4'),
+ 30:
+ dict(
+ name='wingL', id=30, color=[255, 255, 255], type='', swap='wingR'),
+ 31:
+ dict(
+ name='wingR', id=31, color=[255, 255, 255], type='', swap='wingL'),
+ },
+ skeleton_info={
+ 0: dict(link=('eyeL', 'head'), id=0, color=[255, 255, 255]),
+ 1: dict(link=('eyeR', 'head'), id=1, color=[255, 255, 255]),
+ 2: dict(link=('neck', 'head'), id=2, color=[255, 255, 255]),
+ 3: dict(link=('thorax', 'neck'), id=3, color=[255, 255, 255]),
+ 4: dict(link=('abdomen', 'thorax'), id=4, color=[255, 255, 255]),
+ 5: dict(link=('forelegR2', 'forelegR1'), id=5, color=[255, 255, 255]),
+ 6: dict(link=('forelegR3', 'forelegR2'), id=6, color=[255, 255, 255]),
+ 7: dict(link=('forelegR4', 'forelegR3'), id=7, color=[255, 255, 255]),
+ 8: dict(link=('midlegR2', 'midlegR1'), id=8, color=[255, 255, 255]),
+ 9: dict(link=('midlegR3', 'midlegR2'), id=9, color=[255, 255, 255]),
+ 10: dict(link=('midlegR4', 'midlegR3'), id=10, color=[255, 255, 255]),
+ 11:
+ dict(link=('hindlegR2', 'hindlegR1'), id=11, color=[255, 255, 255]),
+ 12:
+ dict(link=('hindlegR3', 'hindlegR2'), id=12, color=[255, 255, 255]),
+ 13:
+ dict(link=('hindlegR4', 'hindlegR3'), id=13, color=[255, 255, 255]),
+ 14:
+ dict(link=('forelegL2', 'forelegL1'), id=14, color=[255, 255, 255]),
+ 15:
+ dict(link=('forelegL3', 'forelegL2'), id=15, color=[255, 255, 255]),
+ 16:
+ dict(link=('forelegL4', 'forelegL3'), id=16, color=[255, 255, 255]),
+ 17: dict(link=('midlegL2', 'midlegL1'), id=17, color=[255, 255, 255]),
+ 18: dict(link=('midlegL3', 'midlegL2'), id=18, color=[255, 255, 255]),
+ 19: dict(link=('midlegL4', 'midlegL3'), id=19, color=[255, 255, 255]),
+ 20:
+ dict(link=('hindlegL2', 'hindlegL1'), id=20, color=[255, 255, 255]),
+ 21:
+ dict(link=('hindlegL3', 'hindlegL2'), id=21, color=[255, 255, 255]),
+ 22:
+ dict(link=('hindlegL4', 'hindlegL3'), id=22, color=[255, 255, 255]),
+ 23: dict(link=('wingL', 'neck'), id=23, color=[255, 255, 255]),
+ 24: dict(link=('wingR', 'neck'), id=24, color=[255, 255, 255])
+ },
+ joint_weights=[1.] * 32,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/freihand2d.py b/modules/rtmpose/configs/_base_/datasets/freihand2d.py
new file mode 100644
index 0000000..ae04742
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/freihand2d.py
@@ -0,0 +1,144 @@
+dataset_info = dict(
+ dataset_name='freihand',
+ paper_info=dict(
+ author='Zimmermann, Christian and Ceylan, Duygu and '
+ 'Yang, Jimei and Russell, Bryan and '
+ 'Argus, Max and Brox, Thomas',
+ title='Freihand: A dataset for markerless capture of hand pose '
+ 'and shape from single rgb images',
+ container='Proceedings of the IEEE International '
+ 'Conference on Computer Vision',
+ year='2019',
+ homepage='https://lmb.informatik.uni-freiburg.de/projects/freihand/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger1',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger3',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger4',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[])
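
The link colors in `skeleton_info` are what visualizers use when rendering predictions. A hedged OpenCV sketch of drawing a hand skeleton from this config (`img` and the 21x2 keypoint array `kpts` are placeholders):

```python
# Drawing sketch: connect keypoints along skeleton_info links, using
# each link's RGB color (reversed to BGR for OpenCV).
import cv2
import numpy as np
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/_base_/datasets/freihand2d.py')
info = cfg.dataset_info
name_to_id = {k['name']: i for i, k in info['keypoint_info'].items()}

def draw_hand(img, kpts):  # kpts: (21, 2) float pixel coordinates
    for link in info['skeleton_info'].values():
        a, b = (name_to_id[n] for n in link['link'])
        pt_a = tuple(int(v) for v in np.round(kpts[a]))
        pt_b = tuple(int(v) for v in np.round(kpts[b]))
        cv2.line(img, pt_a, pt_b, tuple(link['color'][::-1]), 2)
    return img
```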
diff --git a/modules/rtmpose/configs/_base_/datasets/h36m.py b/modules/rtmpose/configs/_base_/datasets/h36m.py
new file mode 100644
index 0000000..f6be31f
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/h36m.py
@@ -0,0 +1,152 @@
+dataset_info = dict(
+ dataset_name='h36m',
+ paper_info=dict(
+ author='Ionescu, Catalin and Papava, Dragos and '
+ 'Olaru, Vlad and Sminchisescu, Cristian',
+ title='Human3.6M: Large Scale Datasets and Predictive '
+ 'Methods for 3D Human Sensing in Natural Environments',
+ container='IEEE Transactions on Pattern Analysis and '
+ 'Machine Intelligence',
+ year='2014',
+ homepage='http://vision.imar.ro/human3.6m/description.php',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='root', id=0, color=[51, 153, 255], type='lower', swap=''),
+ 1:
+ dict(
+ name='right_hip',
+ id=1,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 2:
+ dict(
+ name='right_knee',
+ id=2,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 3:
+ dict(
+ name='right_foot',
+ id=3,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_foot'),
+ 4:
+ dict(
+ name='left_hip',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 5:
+ dict(
+ name='left_knee',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 6:
+ dict(
+ name='left_foot',
+ id=6,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_foot'),
+ 7:
+ dict(name='spine', id=7, color=[51, 153, 255], type='upper', swap=''),
+ 8:
+ dict(name='thorax', id=8, color=[51, 153, 255], type='upper', swap=''),
+ 9:
+ dict(
+ name='neck_base',
+ id=9,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 10:
+ dict(name='head', id=10, color=[51, 153, 255], type='upper', swap=''),
+ 11:
+ dict(
+ name='left_shoulder',
+ id=11,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 12:
+ dict(
+ name='left_elbow',
+ id=12,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 13:
+ dict(
+ name='left_wrist',
+ id=13,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 14:
+ dict(
+ name='right_shoulder',
+ id=14,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 15:
+ dict(
+ name='right_elbow',
+ id=15,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 16:
+ dict(
+ name='right_wrist',
+ id=16,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('root', 'left_hip'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_hip', 'left_knee'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('left_knee', 'left_foot'), id=2, color=[0, 255, 0]),
+ 3:
+ dict(link=('root', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('right_hip', 'right_knee'), id=4, color=[255, 128, 0]),
+ 5:
+ dict(link=('right_knee', 'right_foot'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(link=('root', 'spine'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(link=('spine', 'thorax'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('thorax', 'neck_base'), id=8, color=[51, 153, 255]),
+ 9:
+ dict(link=('neck_base', 'head'), id=9, color=[51, 153, 255]),
+ 10:
+ dict(link=('thorax', 'left_shoulder'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('left_shoulder', 'left_elbow'), id=11, color=[0, 255, 0]),
+ 12:
+ dict(link=('left_elbow', 'left_wrist'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('thorax', 'right_shoulder'), id=13, color=[255, 128, 0]),
+ 14:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=14, color=[255, 128,
+ 0]),
+ 15:
+ dict(link=('right_elbow', 'right_wrist'), id=15, color=[255, 128, 0])
+ },
+ joint_weights=[1.] * 17,
+ sigmas=[],
+ stats_info=dict(bbox_center=(528., 427.), bbox_scale=400.))
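
Unlike the 2D configs, this one carries `stats_info`; the values look like dataset-level defaults (roughly the frame center and a canonical box size for Human3.6M). A small sketch of one plausible use, normalizing 2D inputs when no per-instance bbox is available; this is an assumption about how these numbers are consumed:

```python
# Normalization sketch (assumption: stats_info supplies a fallback
# center/scale for frames without per-instance bounding boxes).
import numpy as np

BBOX_CENTER = np.array([528.0, 427.0])  # stats_info['bbox_center']
BBOX_SCALE = 400.0                      # stats_info['bbox_scale']

def normalize_2d(kpts):  # kpts: (17, 2) pixel coordinates
    """Map pixel coordinates into a roughly unit-scale range."""
    return (kpts - BBOX_CENTER) / BBOX_SCALE
```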
diff --git a/modules/rtmpose/configs/_base_/datasets/h3wb.py b/modules/rtmpose/configs/_base_/datasets/h3wb.py
new file mode 100644
index 0000000..24e3b4e
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/h3wb.py
@@ -0,0 +1,1151 @@
+dataset_info = dict(
+ dataset_name='h3wb',
+ paper_info=dict(
+ author='Yue Zhu, Nermin Samet, David Picard',
+ title='H3WB: Human3.6M 3D WholeBody Dataset and Benchmark',
+ container='International Conf. on Computer Vision (ICCV)',
+ year='2023',
+ homepage='https://github.com/wholebody3d/wholebody3d',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='left_big_toe',
+ id=17,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 18:
+ dict(
+ name='left_small_toe',
+ id=18,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 19:
+ dict(
+ name='left_heel',
+ id=19,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_heel'),
+ 20:
+ dict(
+ name='right_big_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 21:
+ dict(
+ name='right_small_toe',
+ id=21,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 22:
+ dict(
+ name='right_heel',
+ id=22,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_heel'),
+ 23:
+ dict(
+ name='face-0',
+ id=23,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-16'),
+ 24:
+ dict(
+ name='face-1',
+ id=24,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-15'),
+ 25:
+ dict(
+ name='face-2',
+ id=25,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-14'),
+ 26:
+ dict(
+ name='face-3',
+ id=26,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-13'),
+ 27:
+ dict(
+ name='face-4',
+ id=27,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-12'),
+ 28:
+ dict(
+ name='face-5',
+ id=28,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-11'),
+ 29:
+ dict(
+ name='face-6',
+ id=29,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-10'),
+ 30:
+ dict(
+ name='face-7',
+ id=30,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-9'),
+ 31:
+ dict(name='face-8', id=31, color=[247, 34, 5], type='upper', swap=''),
+ 32:
+ dict(
+ name='face-9',
+ id=32,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-7'),
+ 33:
+ dict(
+ name='face-10',
+ id=33,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-6'),
+ 34:
+ dict(
+ name='face-11',
+ id=34,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-5'),
+ 35:
+ dict(
+ name='face-12',
+ id=35,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-4'),
+ 36:
+ dict(
+ name='face-13',
+ id=36,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-3'),
+ 37:
+ dict(
+ name='face-14',
+ id=37,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-2'),
+ 38:
+ dict(
+ name='face-15',
+ id=38,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-1'),
+ 39:
+ dict(
+ name='face-16',
+ id=39,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-0'),
+ 40:
+ dict(
+ name='face-17',
+ id=40,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-26'),
+ 41:
+ dict(
+ name='face-18',
+ id=41,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-25'),
+ 42:
+ dict(
+ name='face-19',
+ id=42,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-24'),
+ 43:
+ dict(
+ name='face-20',
+ id=43,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-23'),
+ 44:
+ dict(
+ name='face-21',
+ id=44,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-22'),
+ 45:
+ dict(
+ name='face-22',
+ id=45,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-21'),
+ 46:
+ dict(
+ name='face-23',
+ id=46,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-20'),
+ 47:
+ dict(
+ name='face-24',
+ id=47,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-19'),
+ 48:
+ dict(
+ name='face-25',
+ id=48,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-18'),
+ 49:
+ dict(
+ name='face-26',
+ id=49,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-17'),
+ 50:
+ dict(name='face-27', id=50, color=[247, 34, 5], type='upper', swap=''),
+ 51:
+ dict(name='face-28', id=51, color=[247, 34, 5], type='upper', swap=''),
+ 52:
+ dict(name='face-29', id=52, color=[247, 34, 5], type='upper', swap=''),
+ 53:
+ dict(name='face-30', id=53, color=[247, 34, 5], type='upper', swap=''),
+ 54:
+ dict(
+ name='face-31',
+ id=54,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-35'),
+ 55:
+ dict(
+ name='face-32',
+ id=55,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-34'),
+ 56:
+ dict(name='face-33', id=56, color=[247, 34, 5], type='upper', swap=''),
+ 57:
+ dict(
+ name='face-34',
+ id=57,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-32'),
+ 58:
+ dict(
+ name='face-35',
+ id=58,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-31'),
+ 59:
+ dict(
+ name='face-36',
+ id=59,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-45'),
+ 60:
+ dict(
+ name='face-37',
+ id=60,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-44'),
+ 61:
+ dict(
+ name='face-38',
+ id=61,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-43'),
+ 62:
+ dict(
+ name='face-39',
+ id=62,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-42'),
+ 63:
+ dict(
+ name='face-40',
+ id=63,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-47'),
+ 64:
+ dict(
+ name='face-41',
+ id=64,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-46'),
+ 65:
+ dict(
+ name='face-42',
+ id=65,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-39'),
+ 66:
+ dict(
+ name='face-43',
+ id=66,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-38'),
+ 67:
+ dict(
+ name='face-44',
+ id=67,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-37'),
+ 68:
+ dict(
+ name='face-45',
+ id=68,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-36'),
+ 69:
+ dict(
+ name='face-46',
+ id=69,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-41'),
+ 70:
+ dict(
+ name='face-47',
+ id=70,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-40'),
+ 71:
+ dict(
+ name='face-48',
+ id=71,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-54'),
+ 72:
+ dict(
+ name='face-49',
+ id=72,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-53'),
+ 73:
+ dict(
+ name='face-50',
+ id=73,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-52'),
+ 74:
+ dict(name='face-51', id=74, color=[247, 34, 5], type='upper', swap=''),
+ 75:
+ dict(
+ name='face-52',
+ id=75,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-50'),
+ 76:
+ dict(
+ name='face-53',
+ id=76,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-49'),
+ 77:
+ dict(
+ name='face-54',
+ id=77,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-48'),
+ 78:
+ dict(
+ name='face-55',
+ id=78,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-59'),
+ 79:
+ dict(
+ name='face-56',
+ id=79,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-58'),
+ 80:
+ dict(name='face-57', id=80, color=[247, 34, 5], type='upper', swap=''),
+ 81:
+ dict(
+ name='face-58',
+ id=81,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-56'),
+ 82:
+ dict(
+ name='face-59',
+ id=82,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-55'),
+ 83:
+ dict(
+ name='face-60',
+ id=83,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-64'),
+ 84:
+ dict(
+ name='face-61',
+ id=84,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-63'),
+ 85:
+ dict(name='face-62', id=85, color=[247, 34, 5], type='upper', swap=''),
+ 86:
+ dict(
+ name='face-63',
+ id=86,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-61'),
+ 87:
+ dict(
+ name='face-64',
+ id=87,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-60'),
+ 88:
+ dict(
+ name='face-65',
+ id=88,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-67'),
+ 89:
+ dict(name='face-66', id=89, color=[247, 34, 5], type='upper', swap=''),
+ 90:
+ dict(
+ name='face-67',
+ id=90,
+ color=[247, 34, 5],
+ type='upper',
+ swap='face-65'),
+ 91:
+ dict(
+ name='left_hand_root',
+ id=91,
+ color=[247, 34, 5],
+ type='',
+ swap='right_hand_root'),
+ 92:
+ dict(
+ name='left_thumb1',
+ id=92,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb1'),
+ 93:
+ dict(
+ name='left_thumb2',
+ id=93,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb2'),
+ 94:
+ dict(
+ name='left_thumb3',
+ id=94,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb3'),
+ 95:
+ dict(
+ name='left_thumb4',
+ id=95,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb4'),
+ 96:
+ dict(
+ name='left_forefinger1',
+ id=96,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger1'),
+ 97:
+ dict(
+ name='left_forefinger2',
+ id=97,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger2'),
+ 98:
+ dict(
+ name='left_forefinger3',
+ id=98,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger3'),
+ 99:
+ dict(
+ name='left_forefinger4',
+ id=99,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger4'),
+ 100:
+ dict(
+ name='left_middle_finger1',
+ id=100,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 101:
+ dict(
+ name='left_middle_finger2',
+ id=101,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 102:
+ dict(
+ name='left_middle_finger3',
+ id=102,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 103:
+ dict(
+ name='left_middle_finger4',
+ id=103,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 104:
+ dict(
+ name='left_ring_finger1',
+ id=104,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger1'),
+ 105:
+ dict(
+ name='left_ring_finger2',
+ id=105,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger2'),
+ 106:
+ dict(
+ name='left_ring_finger3',
+ id=106,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger3'),
+ 107:
+ dict(
+ name='left_ring_finger4',
+ id=107,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger4'),
+ 108:
+ dict(
+ name='left_pinky_finger1',
+ id=108,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger1'),
+ 109:
+ dict(
+ name='left_pinky_finger2',
+ id=109,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger2'),
+ 110:
+ dict(
+ name='left_pinky_finger3',
+ id=110,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger3'),
+ 111:
+ dict(
+ name='left_pinky_finger4',
+ id=111,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger4'),
+ 112:
+ dict(
+ name='right_hand_root',
+ id=112,
+ color=[247, 34, 5],
+ type='',
+ swap='left_hand_root'),
+ 113:
+ dict(
+ name='right_thumb1',
+ id=113,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb1'),
+ 114:
+ dict(
+ name='right_thumb2',
+ id=114,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb2'),
+ 115:
+ dict(
+ name='right_thumb3',
+ id=115,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb3'),
+ 116:
+ dict(
+ name='right_thumb4',
+ id=116,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb4'),
+ 117:
+ dict(
+ name='right_forefinger1',
+ id=117,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger1'),
+ 118:
+ dict(
+ name='right_forefinger2',
+ id=118,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger2'),
+ 119:
+ dict(
+ name='right_forefinger3',
+ id=119,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger3'),
+ 120:
+ dict(
+ name='right_forefinger4',
+ id=120,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger4'),
+ 121:
+ dict(
+ name='right_middle_finger1',
+ id=121,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 122:
+ dict(
+ name='right_middle_finger2',
+ id=122,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 123:
+ dict(
+ name='right_middle_finger3',
+ id=123,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 124:
+ dict(
+ name='right_middle_finger4',
+ id=124,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 125:
+ dict(
+ name='right_ring_finger1',
+ id=125,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger1'),
+ 126:
+ dict(
+ name='right_ring_finger2',
+ id=126,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger2'),
+ 127:
+ dict(
+ name='right_ring_finger3',
+ id=127,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger3'),
+ 128:
+ dict(
+ name='right_ring_finger4',
+ id=128,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger4'),
+ 129:
+ dict(
+ name='right_pinky_finger1',
+ id=129,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger1'),
+ 130:
+ dict(
+ name='right_pinky_finger2',
+ id=130,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger2'),
+ 131:
+ dict(
+ name='right_pinky_finger3',
+ id=131,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger3'),
+ 132:
+ dict(
+ name='right_pinky_finger4',
+ id=132,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger4')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]),
+ 20:
+ dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]),
+ 21:
+ dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=23,
+ color=[255, 128, 0]),
+ 24:
+ dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]),
+ 25:
+ dict(
+ link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128,
+ 0]),
+ 26:
+ dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]),
+ 27:
+ dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]),
+ 28:
+ dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]),
+ 29:
+ dict(
+ link=('left_hand_root', 'left_forefinger1'),
+ id=29,
+ color=[255, 153, 255]),
+ 30:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=30,
+ color=[255, 153, 255]),
+ 31:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=31,
+ color=[255, 153, 255]),
+ 32:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=32,
+ color=[255, 153, 255]),
+ 33:
+ dict(
+ link=('left_hand_root', 'left_middle_finger1'),
+ id=33,
+ color=[102, 178, 255]),
+ 34:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=34,
+ color=[102, 178, 255]),
+ 35:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=35,
+ color=[102, 178, 255]),
+ 36:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=36,
+ color=[102, 178, 255]),
+ 37:
+ dict(
+ link=('left_hand_root', 'left_ring_finger1'),
+ id=37,
+ color=[255, 51, 51]),
+ 38:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=38,
+ color=[255, 51, 51]),
+ 39:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=39,
+ color=[255, 51, 51]),
+ 40:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=40,
+ color=[255, 51, 51]),
+ 41:
+ dict(
+ link=('left_hand_root', 'left_pinky_finger1'),
+ id=41,
+ color=[0, 255, 0]),
+ 42:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=42,
+ color=[0, 255, 0]),
+ 43:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=43,
+ color=[0, 255, 0]),
+ 44:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=44,
+ color=[0, 255, 0]),
+ 45:
+ dict(
+ link=('right_hand_root', 'right_thumb1'),
+ id=45,
+ color=[255, 128, 0]),
+ 46:
+ dict(
+ link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]),
+ 47:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]),
+ 48:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]),
+ 49:
+ dict(
+ link=('right_hand_root', 'right_forefinger1'),
+ id=49,
+ color=[255, 153, 255]),
+ 50:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=50,
+ color=[255, 153, 255]),
+ 51:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=51,
+ color=[255, 153, 255]),
+ 52:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=52,
+ color=[255, 153, 255]),
+ 53:
+ dict(
+ link=('right_hand_root', 'right_middle_finger1'),
+ id=53,
+ color=[102, 178, 255]),
+ 54:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=54,
+ color=[102, 178, 255]),
+ 55:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=55,
+ color=[102, 178, 255]),
+ 56:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=56,
+ color=[102, 178, 255]),
+ 57:
+ dict(
+ link=('right_hand_root', 'right_ring_finger1'),
+ id=57,
+ color=[255, 51, 51]),
+ 58:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=58,
+ color=[255, 51, 51]),
+ 59:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=59,
+ color=[255, 51, 51]),
+ 60:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=60,
+ color=[255, 51, 51]),
+ 61:
+ dict(
+ link=('right_hand_root', 'right_pinky_finger1'),
+ id=61,
+ color=[0, 255, 0]),
+ 62:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=62,
+ color=[0, 255, 0]),
+ 63:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=63,
+ color=[0, 255, 0]),
+ 64:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=64,
+ color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 133,
+ # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/'
+ # 'evaluation/myeval_wholebody.py#L175'
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066,
+ 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031,
+ 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045,
+ 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015,
+ 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017,
+ 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010,
+ 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009,
+ 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01,
+ 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035,
+ 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019,
+ 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024,
+ 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02,
+ 0.019, 0.022, 0.031
+ ])
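
The 133 keypoints follow the COCO-WholeBody layout (the sigmas above come from the evaluation script referenced in the comment), so fixed index ranges separate the groups. A sketch with the ranges read off the `keypoint_info` table:

```python
# Split a (133, D) whole-body keypoint array into its groups; the
# index ranges follow the keypoint_info entries above.
import numpy as np

def split_wholebody(kpts):
    return dict(
        body=kpts[0:17],          # nose .. right_ankle
        feet=kpts[17:23],         # big/small toes and heels
        face=kpts[23:91],         # face-0 .. face-67
        left_hand=kpts[91:112],   # left_hand_root .. left_pinky_finger4
        right_hand=kpts[112:133])

parts = split_wholebody(np.zeros((133, 3)))
print({k: v.shape for k, v in parts.items()})
```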
diff --git a/modules/rtmpose/configs/_base_/datasets/halpe.py b/modules/rtmpose/configs/_base_/datasets/halpe.py
new file mode 100644
index 0000000..cccf9f4
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/halpe.py
@@ -0,0 +1,1157 @@
+dataset_info = dict(
+ dataset_name='halpe',
+ paper_info=dict(
+ author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie'
+ ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu'
+ ' and Ma, Ze and Chen, Mingyang and Lu, Cewu',
+ title='PaStaNet: Toward Human Activity Knowledge Engine',
+ container='CVPR',
+ year='2020',
+ homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''),
+ 18:
+ dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''),
+ 19:
+ dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''),
+ 20:
+ dict(
+ name='left_big_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 21:
+ dict(
+ name='right_big_toe',
+ id=21,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 22:
+ dict(
+ name='left_small_toe',
+ id=22,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 23:
+ dict(
+ name='right_small_toe',
+ id=23,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 24:
+ dict(
+ name='left_heel',
+ id=24,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_heel'),
+ 25:
+ dict(
+ name='right_heel',
+ id=25,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_heel'),
+ 26:
+ dict(
+ name='face-0',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='face-16'),
+ 27:
+ dict(
+ name='face-1',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='face-15'),
+ 28:
+ dict(
+ name='face-2',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='face-14'),
+ 29:
+ dict(
+ name='face-3',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='face-13'),
+ 30:
+ dict(
+ name='face-4',
+ id=30,
+ color=[255, 255, 255],
+ type='',
+ swap='face-12'),
+ 31:
+ dict(
+ name='face-5',
+ id=31,
+ color=[255, 255, 255],
+ type='',
+ swap='face-11'),
+ 32:
+ dict(
+ name='face-6',
+ id=32,
+ color=[255, 255, 255],
+ type='',
+ swap='face-10'),
+ 33:
+ dict(
+ name='face-7',
+ id=33,
+ color=[255, 255, 255],
+ type='',
+ swap='face-9'),
+ 34:
+ dict(name='face-8', id=34, color=[255, 255, 255], type='', swap=''),
+ 35:
+ dict(
+ name='face-9',
+ id=35,
+ color=[255, 255, 255],
+ type='',
+ swap='face-7'),
+ 36:
+ dict(
+ name='face-10',
+ id=36,
+ color=[255, 255, 255],
+ type='',
+ swap='face-6'),
+ 37:
+ dict(
+ name='face-11',
+ id=37,
+ color=[255, 255, 255],
+ type='',
+ swap='face-5'),
+ 38:
+ dict(
+ name='face-12',
+ id=38,
+ color=[255, 255, 255],
+ type='',
+ swap='face-4'),
+ 39:
+ dict(
+ name='face-13',
+ id=39,
+ color=[255, 255, 255],
+ type='',
+ swap='face-3'),
+ 40:
+ dict(
+ name='face-14',
+ id=40,
+ color=[255, 255, 255],
+ type='',
+ swap='face-2'),
+ 41:
+ dict(
+ name='face-15',
+ id=41,
+ color=[255, 255, 255],
+ type='',
+ swap='face-1'),
+ 42:
+ dict(
+ name='face-16',
+ id=42,
+ color=[255, 255, 255],
+ type='',
+ swap='face-0'),
+ 43:
+ dict(
+ name='face-17',
+ id=43,
+ color=[255, 255, 255],
+ type='',
+ swap='face-26'),
+ 44:
+ dict(
+ name='face-18',
+ id=44,
+ color=[255, 255, 255],
+ type='',
+ swap='face-25'),
+ 45:
+ dict(
+ name='face-19',
+ id=45,
+ color=[255, 255, 255],
+ type='',
+ swap='face-24'),
+ 46:
+ dict(
+ name='face-20',
+ id=46,
+ color=[255, 255, 255],
+ type='',
+ swap='face-23'),
+ 47:
+ dict(
+ name='face-21',
+ id=47,
+ color=[255, 255, 255],
+ type='',
+ swap='face-22'),
+ 48:
+ dict(
+ name='face-22',
+ id=48,
+ color=[255, 255, 255],
+ type='',
+ swap='face-21'),
+ 49:
+ dict(
+ name='face-23',
+ id=49,
+ color=[255, 255, 255],
+ type='',
+ swap='face-20'),
+ 50:
+ dict(
+ name='face-24',
+ id=50,
+ color=[255, 255, 255],
+ type='',
+ swap='face-19'),
+ 51:
+ dict(
+ name='face-25',
+ id=51,
+ color=[255, 255, 255],
+ type='',
+ swap='face-18'),
+ 52:
+ dict(
+ name='face-26',
+ id=52,
+ color=[255, 255, 255],
+ type='',
+ swap='face-17'),
+ 53:
+ dict(name='face-27', id=53, color=[255, 255, 255], type='', swap=''),
+ 54:
+ dict(name='face-28', id=54, color=[255, 255, 255], type='', swap=''),
+ 55:
+ dict(name='face-29', id=55, color=[255, 255, 255], type='', swap=''),
+ 56:
+ dict(name='face-30', id=56, color=[255, 255, 255], type='', swap=''),
+ 57:
+ dict(
+ name='face-31',
+ id=57,
+ color=[255, 255, 255],
+ type='',
+ swap='face-35'),
+ 58:
+ dict(
+ name='face-32',
+ id=58,
+ color=[255, 255, 255],
+ type='',
+ swap='face-34'),
+ 59:
+ dict(name='face-33', id=59, color=[255, 255, 255], type='', swap=''),
+ 60:
+ dict(
+ name='face-34',
+ id=60,
+ color=[255, 255, 255],
+ type='',
+ swap='face-32'),
+ 61:
+ dict(
+ name='face-35',
+ id=61,
+ color=[255, 255, 255],
+ type='',
+ swap='face-31'),
+ 62:
+ dict(
+ name='face-36',
+ id=62,
+ color=[255, 255, 255],
+ type='',
+ swap='face-45'),
+ 63:
+ dict(
+ name='face-37',
+ id=63,
+ color=[255, 255, 255],
+ type='',
+ swap='face-44'),
+ 64:
+ dict(
+ name='face-38',
+ id=64,
+ color=[255, 255, 255],
+ type='',
+ swap='face-43'),
+ 65:
+ dict(
+ name='face-39',
+ id=65,
+ color=[255, 255, 255],
+ type='',
+ swap='face-42'),
+ 66:
+ dict(
+ name='face-40',
+ id=66,
+ color=[255, 255, 255],
+ type='',
+ swap='face-47'),
+ 67:
+ dict(
+ name='face-41',
+ id=67,
+ color=[255, 255, 255],
+ type='',
+ swap='face-46'),
+ 68:
+ dict(
+ name='face-42',
+ id=68,
+ color=[255, 255, 255],
+ type='',
+ swap='face-39'),
+ 69:
+ dict(
+ name='face-43',
+ id=69,
+ color=[255, 255, 255],
+ type='',
+ swap='face-38'),
+ 70:
+ dict(
+ name='face-44',
+ id=70,
+ color=[255, 255, 255],
+ type='',
+ swap='face-37'),
+ 71:
+ dict(
+ name='face-45',
+ id=71,
+ color=[255, 255, 255],
+ type='',
+ swap='face-36'),
+ 72:
+ dict(
+ name='face-46',
+ id=72,
+ color=[255, 255, 255],
+ type='',
+ swap='face-41'),
+ 73:
+ dict(
+ name='face-47',
+ id=73,
+ color=[255, 255, 255],
+ type='',
+ swap='face-40'),
+ 74:
+ dict(
+ name='face-48',
+ id=74,
+ color=[255, 255, 255],
+ type='',
+ swap='face-54'),
+ 75:
+ dict(
+ name='face-49',
+ id=75,
+ color=[255, 255, 255],
+ type='',
+ swap='face-53'),
+ 76:
+ dict(
+ name='face-50',
+ id=76,
+ color=[255, 255, 255],
+ type='',
+ swap='face-52'),
+ 77:
+ dict(name='face-51', id=77, color=[255, 255, 255], type='', swap=''),
+ 78:
+ dict(
+ name='face-52',
+ id=78,
+ color=[255, 255, 255],
+ type='',
+ swap='face-50'),
+ 79:
+ dict(
+ name='face-53',
+ id=79,
+ color=[255, 255, 255],
+ type='',
+ swap='face-49'),
+ 80:
+ dict(
+ name='face-54',
+ id=80,
+ color=[255, 255, 255],
+ type='',
+ swap='face-48'),
+ 81:
+ dict(
+ name='face-55',
+ id=81,
+ color=[255, 255, 255],
+ type='',
+ swap='face-59'),
+ 82:
+ dict(
+ name='face-56',
+ id=82,
+ color=[255, 255, 255],
+ type='',
+ swap='face-58'),
+ 83:
+ dict(name='face-57', id=83, color=[255, 255, 255], type='', swap=''),
+ 84:
+ dict(
+ name='face-58',
+ id=84,
+ color=[255, 255, 255],
+ type='',
+ swap='face-56'),
+ 85:
+ dict(
+ name='face-59',
+ id=85,
+ color=[255, 255, 255],
+ type='',
+ swap='face-55'),
+ 86:
+ dict(
+ name='face-60',
+ id=86,
+ color=[255, 255, 255],
+ type='',
+ swap='face-64'),
+ 87:
+ dict(
+ name='face-61',
+ id=87,
+ color=[255, 255, 255],
+ type='',
+ swap='face-63'),
+ 88:
+ dict(name='face-62', id=88, color=[255, 255, 255], type='', swap=''),
+ 89:
+ dict(
+ name='face-63',
+ id=89,
+ color=[255, 255, 255],
+ type='',
+ swap='face-61'),
+ 90:
+ dict(
+ name='face-64',
+ id=90,
+ color=[255, 255, 255],
+ type='',
+ swap='face-60'),
+ 91:
+ dict(
+ name='face-65',
+ id=91,
+ color=[255, 255, 255],
+ type='',
+ swap='face-67'),
+ 92:
+ dict(name='face-66', id=92, color=[255, 255, 255], type='', swap=''),
+ 93:
+ dict(
+ name='face-67',
+ id=93,
+ color=[255, 255, 255],
+ type='',
+ swap='face-65'),
+ 94:
+ dict(
+ name='left_hand_root',
+ id=94,
+ color=[255, 255, 255],
+ type='',
+ swap='right_hand_root'),
+ 95:
+ dict(
+ name='left_thumb1',
+ id=95,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb1'),
+ 96:
+ dict(
+ name='left_thumb2',
+ id=96,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb2'),
+ 97:
+ dict(
+ name='left_thumb3',
+ id=97,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb3'),
+ 98:
+ dict(
+ name='left_thumb4',
+ id=98,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb4'),
+ 99:
+ dict(
+ name='left_forefinger1',
+ id=99,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger1'),
+ 100:
+ dict(
+ name='left_forefinger2',
+ id=100,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger2'),
+ 101:
+ dict(
+ name='left_forefinger3',
+ id=101,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger3'),
+ 102:
+ dict(
+ name='left_forefinger4',
+ id=102,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger4'),
+ 103:
+ dict(
+ name='left_middle_finger1',
+ id=103,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 104:
+ dict(
+ name='left_middle_finger2',
+ id=104,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 105:
+ dict(
+ name='left_middle_finger3',
+ id=105,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 106:
+ dict(
+ name='left_middle_finger4',
+ id=106,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 107:
+ dict(
+ name='left_ring_finger1',
+ id=107,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger1'),
+ 108:
+ dict(
+ name='left_ring_finger2',
+ id=108,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger2'),
+ 109:
+ dict(
+ name='left_ring_finger3',
+ id=109,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger3'),
+ 110:
+ dict(
+ name='left_ring_finger4',
+ id=110,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger4'),
+ 111:
+ dict(
+ name='left_pinky_finger1',
+ id=111,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger1'),
+ 112:
+ dict(
+ name='left_pinky_finger2',
+ id=112,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger2'),
+ 113:
+ dict(
+ name='left_pinky_finger3',
+ id=113,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger3'),
+ 114:
+ dict(
+ name='left_pinky_finger4',
+ id=114,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger4'),
+ 115:
+ dict(
+ name='right_hand_root',
+ id=115,
+ color=[255, 255, 255],
+ type='',
+ swap='left_hand_root'),
+ 116:
+ dict(
+ name='right_thumb1',
+ id=116,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb1'),
+ 117:
+ dict(
+ name='right_thumb2',
+ id=117,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb2'),
+ 118:
+ dict(
+ name='right_thumb3',
+ id=118,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb3'),
+ 119:
+ dict(
+ name='right_thumb4',
+ id=119,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb4'),
+ 120:
+ dict(
+ name='right_forefinger1',
+ id=120,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger1'),
+ 121:
+ dict(
+ name='right_forefinger2',
+ id=121,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger2'),
+ 122:
+ dict(
+ name='right_forefinger3',
+ id=122,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger3'),
+ 123:
+ dict(
+ name='right_forefinger4',
+ id=123,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger4'),
+ 124:
+ dict(
+ name='right_middle_finger1',
+ id=124,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 125:
+ dict(
+ name='right_middle_finger2',
+ id=125,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 126:
+ dict(
+ name='right_middle_finger3',
+ id=126,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 127:
+ dict(
+ name='right_middle_finger4',
+ id=127,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 128:
+ dict(
+ name='right_ring_finger1',
+ id=128,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger1'),
+ 129:
+ dict(
+ name='right_ring_finger2',
+ id=129,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger2'),
+ 130:
+ dict(
+ name='right_ring_finger3',
+ id=130,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger3'),
+ 131:
+ dict(
+ name='right_ring_finger4',
+ id=131,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger4'),
+ 132:
+ dict(
+ name='right_pinky_finger1',
+ id=132,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger1'),
+ 133:
+ dict(
+ name='right_pinky_finger2',
+ id=133,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger2'),
+ 134:
+ dict(
+ name='right_pinky_finger3',
+ id=134,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger3'),
+ 135:
+ dict(
+ name='right_pinky_finger4',
+ id=135,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger4')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]),
+ 3:
+ dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]),
+ 5:
+ dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(link=('head', 'neck'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128,
+ 0]),
+ 13:
+ dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]),
+ 14:
+ dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]),
+ 20:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]),
+ 21:
+ dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]),
+ 23:
+ dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]),
+ 24:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]),
+ 25:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=25,
+ color=[255, 128, 0]),
+ 26:
+ dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]),
+ 27:
+ dict(link=('left_wrist', 'left_thumb1'), id=27, color=[255, 128, 0]),
+ 28:
+ dict(link=('left_thumb1', 'left_thumb2'), id=28, color=[255, 128, 0]),
+ 29:
+ dict(link=('left_thumb2', 'left_thumb3'), id=29, color=[255, 128, 0]),
+ 30:
+ dict(link=('left_thumb3', 'left_thumb4'), id=30, color=[255, 128, 0]),
+ 31:
+ dict(
+ link=('left_wrist', 'left_forefinger1'),
+ id=31,
+ color=[255, 153, 255]),
+ 32:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=32,
+ color=[255, 153, 255]),
+ 33:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=33,
+ color=[255, 153, 255]),
+ 34:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=34,
+ color=[255, 153, 255]),
+ 35:
+ dict(
+ link=('left_wrist', 'left_middle_finger1'),
+ id=35,
+ color=[102, 178, 255]),
+ 36:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=36,
+ color=[102, 178, 255]),
+ 37:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=37,
+ color=[102, 178, 255]),
+ 38:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=38,
+ color=[102, 178, 255]),
+ 39:
+ dict(
+ link=('left_wrist', 'left_ring_finger1'),
+ id=39,
+ color=[255, 51, 51]),
+ 40:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=40,
+ color=[255, 51, 51]),
+ 41:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=41,
+ color=[255, 51, 51]),
+ 42:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=42,
+ color=[255, 51, 51]),
+ 43:
+ dict(
+ link=('left_wrist', 'left_pinky_finger1'),
+ id=43,
+ color=[0, 255, 0]),
+ 44:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=44,
+ color=[0, 255, 0]),
+ 45:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=45,
+ color=[0, 255, 0]),
+ 46:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=46,
+ color=[0, 255, 0]),
+ 47:
+ dict(link=('right_wrist', 'right_thumb1'), id=47, color=[255, 128, 0]),
+ 48:
+ dict(
+ link=('right_thumb1', 'right_thumb2'), id=48, color=[255, 128, 0]),
+ 49:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=49, color=[255, 128, 0]),
+ 50:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=50, color=[255, 128, 0]),
+ 51:
+ dict(
+ link=('right_wrist', 'right_forefinger1'),
+ id=51,
+ color=[255, 153, 255]),
+ 52:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=52,
+ color=[255, 153, 255]),
+ 53:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=53,
+ color=[255, 153, 255]),
+ 54:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=54,
+ color=[255, 153, 255]),
+ 55:
+ dict(
+ link=('right_wrist', 'right_middle_finger1'),
+ id=55,
+ color=[102, 178, 255]),
+ 56:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=56,
+ color=[102, 178, 255]),
+ 57:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=57,
+ color=[102, 178, 255]),
+ 58:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=58,
+ color=[102, 178, 255]),
+ 59:
+ dict(
+ link=('right_wrist', 'right_ring_finger1'),
+ id=59,
+ color=[255, 51, 51]),
+ 60:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=60,
+ color=[255, 51, 51]),
+ 61:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=61,
+ color=[255, 51, 51]),
+ 62:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=62,
+ color=[255, 51, 51]),
+ 63:
+ dict(
+ link=('right_wrist', 'right_pinky_finger1'),
+ id=63,
+ color=[0, 255, 0]),
+ 64:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=64,
+ color=[0, 255, 0]),
+ 65:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=65,
+ color=[0, 255, 0]),
+ 66:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=66,
+ color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 136,
+
+ # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/'
+ # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245'
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.08, 0.08, 0.08,
+ 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015,
+ 0.015, 0.015, 0.015, 0.015, 0.015, 0.015
+ ])
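
The 136-entry `sigmas` list exists for COCO-style OKS evaluation: each σ_i controls how quickly the score for keypoint i decays with localization error, so face points (0.015) are judged far more strictly than hips (0.107). A minimal NumPy sketch of that computation under the standard COCO formulation (function name and array shapes are illustrative, not mmpose API):

```python
import numpy as np

def object_keypoint_similarity(pred, gt, visible, area, sigmas):
    """OKS between one predicted pose and one ground-truth pose.

    pred, gt: (K, 2) keypoint coordinates; visible: (K,) 0/1 flags;
    area: ground-truth object area; sigmas: the (K,) per-keypoint
    constants from a dataset_info file like the one above.
    """
    sigmas = np.asarray(sigmas, dtype=float)
    d2 = np.sum((np.asarray(pred) - np.asarray(gt)) ** 2, axis=-1)
    # COCO formulation: e_i = d_i^2 / (2 * s^2 * k_i^2), with k_i = 2 * sigma_i
    e = d2 / (2.0 * area * (2.0 * sigmas) ** 2 + np.spacing(1))
    mask = np.asarray(visible) > 0
    return float(np.mean(np.exp(-e[mask]))) if mask.any() else 0.0
```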
diff --git a/modules/rtmpose/configs/_base_/datasets/halpe26.py b/modules/rtmpose/configs/_base_/datasets/halpe26.py
new file mode 100644
index 0000000..7f4d549
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/halpe26.py
@@ -0,0 +1,274 @@
+dataset_info = dict(
+ dataset_name='halpe26',
+ paper_info=dict(
+ author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie'
+ ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu'
+ ' and Ma, Ze and Chen, Mingyang and Lu, Cewu',
+ title='PaStaNet: Toward Human Activity Knowledge Engine',
+ container='CVPR',
+ year='2020',
+ homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''),
+ 18:
+ dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''),
+ 19:
+ dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''),
+ 20:
+ dict(
+ name='left_big_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 21:
+ dict(
+ name='right_big_toe',
+ id=21,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 22:
+ dict(
+ name='left_small_toe',
+ id=22,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 23:
+ dict(
+ name='right_small_toe',
+ id=23,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 24:
+ dict(
+ name='left_heel',
+ id=24,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_heel'),
+ 25:
+ dict(
+ name='right_heel',
+ id=25,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_heel')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]),
+ 3:
+ dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]),
+ 5:
+ dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(link=('head', 'neck'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128,
+ 0]),
+ 13:
+ dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]),
+ 14:
+ dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]),
+ 20:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]),
+ 21:
+ dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]),
+ 23:
+ dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]),
+ 24:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]),
+ 25:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=25,
+ color=[255, 128, 0]),
+ 26:
+ dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]),
+ },
+ # the joint_weights is modified by MMPose Team
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ] + [1., 1., 1.2] + [1.5] * 6,
+
+ # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/'
+ # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245'
+ sigmas=[
+ 0.026,
+ 0.025,
+ 0.025,
+ 0.035,
+ 0.035,
+ 0.079,
+ 0.079,
+ 0.072,
+ 0.072,
+ 0.062,
+ 0.062,
+ 0.107,
+ 0.107,
+ 0.087,
+ 0.087,
+ 0.089,
+ 0.089,
+ 0.026,
+ 0.026,
+ 0.066,
+ 0.079,
+ 0.079,
+ 0.079,
+ 0.079,
+ 0.079,
+ 0.079,
+ ])
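
The per-keypoint `swap` field is what drives horizontal-flip augmentation and flip-test: mirroring the image turns `left_ankle` into `right_ankle`, so the label channels must be permuted to match. A sketch of deriving that permutation from a `keypoint_info` dict of this shape (plain Python, not the mmpose internals):

```python
def build_flip_indices(keypoint_info):
    """Return flip[i] = id of keypoint i's horizontal mirror.

    Self-symmetric keypoints (swap == '') map to themselves, so the
    table can be applied directly after mirroring an image.
    """
    name_to_id = {v['name']: v['id'] for v in keypoint_info.values()}
    flip = []
    for i in range(len(keypoint_info)):
        swap = keypoint_info[i]['swap']
        flip.append(name_to_id[swap] if swap else i)
    return flip

# For halpe26: flip[15] == 16 (left_ankle <-> right_ankle),
# flip[0] == 0 (nose is its own mirror).
```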
diff --git a/modules/rtmpose/configs/_base_/datasets/horse10.py b/modules/rtmpose/configs/_base_/datasets/horse10.py
new file mode 100644
index 0000000..60cec1f
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/horse10.py
@@ -0,0 +1,201 @@
+dataset_info = dict(
+ dataset_name='horse10',
+ paper_info=dict(
+ author='Mathis, Alexander and Biasi, Thomas and '
+ 'Schneider, Steffen and '
+ 'Yuksekgonul, Mert and Rogers, Byron and '
+ 'Bethge, Matthias and '
+ 'Mathis, Mackenzie W',
+ title='Pretraining boosts out-of-domain robustness '
+ 'for pose estimation',
+ container='Proceedings of the IEEE/CVF Winter Conference on '
+ 'Applications of Computer Vision',
+ year='2021',
+ homepage='http://www.mackenziemathislab.org/horse10',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='Nose', id=0, color=[255, 153, 255], type='upper', swap=''),
+ 1:
+ dict(name='Eye', id=1, color=[255, 153, 255], type='upper', swap=''),
+ 2:
+ dict(
+ name='Nearknee',
+ id=2,
+ color=[255, 102, 255],
+ type='upper',
+ swap=''),
+ 3:
+ dict(
+ name='Nearfrontfetlock',
+ id=3,
+ color=[255, 102, 255],
+ type='upper',
+ swap=''),
+ 4:
+ dict(
+ name='Nearfrontfoot',
+ id=4,
+ color=[255, 102, 255],
+ type='upper',
+ swap=''),
+ 5:
+ dict(
+ name='Offknee', id=5, color=[255, 102, 255], type='upper',
+ swap=''),
+ 6:
+ dict(
+ name='Offfrontfetlock',
+ id=6,
+ color=[255, 102, 255],
+ type='upper',
+ swap=''),
+ 7:
+ dict(
+ name='Offfrontfoot',
+ id=7,
+ color=[255, 102, 255],
+ type='upper',
+ swap=''),
+ 8:
+ dict(
+ name='Shoulder',
+ id=8,
+ color=[255, 153, 255],
+ type='upper',
+ swap=''),
+ 9:
+ dict(
+ name='Midshoulder',
+ id=9,
+ color=[255, 153, 255],
+ type='upper',
+ swap=''),
+ 10:
+ dict(
+ name='Elbow', id=10, color=[255, 153, 255], type='upper', swap=''),
+ 11:
+ dict(
+ name='Girth', id=11, color=[255, 153, 255], type='upper', swap=''),
+ 12:
+ dict(
+ name='Wither', id=12, color=[255, 153, 255], type='upper',
+ swap=''),
+ 13:
+ dict(
+ name='Nearhindhock',
+ id=13,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 14:
+ dict(
+ name='Nearhindfetlock',
+ id=14,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 15:
+ dict(
+ name='Nearhindfoot',
+ id=15,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 16:
+ dict(name='Hip', id=16, color=[255, 153, 255], type='lower', swap=''),
+ 17:
+ dict(
+ name='Stifle', id=17, color=[255, 153, 255], type='lower',
+ swap=''),
+ 18:
+ dict(
+ name='Offhindhock',
+ id=18,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 19:
+ dict(
+ name='Offhindfetlock',
+ id=19,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 20:
+ dict(
+ name='Offhindfoot',
+ id=20,
+ color=[255, 51, 255],
+ type='lower',
+ swap=''),
+ 21:
+ dict(
+ name='Ischium',
+ id=21,
+ color=[255, 153, 255],
+ type='lower',
+ swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('Nose', 'Eye'), id=0, color=[255, 153, 255]),
+ 1:
+ dict(link=('Eye', 'Wither'), id=1, color=[255, 153, 255]),
+ 2:
+ dict(link=('Wither', 'Hip'), id=2, color=[255, 153, 255]),
+ 3:
+ dict(link=('Hip', 'Ischium'), id=3, color=[255, 153, 255]),
+ 4:
+ dict(link=('Ischium', 'Stifle'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('Stifle', 'Girth'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('Girth', 'Elbow'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('Elbow', 'Shoulder'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('Shoulder', 'Midshoulder'), id=8, color=[255, 153, 255]),
+ 9:
+ dict(link=('Midshoulder', 'Wither'), id=9, color=[255, 153, 255]),
+ 10:
+ dict(
+ link=('Nearknee', 'Nearfrontfetlock'),
+ id=10,
+ color=[255, 102, 255]),
+ 11:
+ dict(
+ link=('Nearfrontfetlock', 'Nearfrontfoot'),
+ id=11,
+ color=[255, 102, 255]),
+ 12:
+ dict(
+ link=('Offknee', 'Offfrontfetlock'), id=12, color=[255, 102, 255]),
+ 13:
+ dict(
+ link=('Offfrontfetlock', 'Offfrontfoot'),
+ id=13,
+ color=[255, 102, 255]),
+ 14:
+ dict(
+ link=('Nearhindhock', 'Nearhindfetlock'),
+ id=14,
+ color=[255, 51, 255]),
+ 15:
+ dict(
+ link=('Nearhindfetlock', 'Nearhindfoot'),
+ id=15,
+ color=[255, 51, 255]),
+ 16:
+ dict(
+ link=('Offhindhock', 'Offhindfetlock'),
+ id=16,
+ color=[255, 51, 255]),
+ 17:
+ dict(
+ link=('Offhindfetlock', 'Offhindfoot'),
+ id=17,
+ color=[255, 51, 255])
+ },
+ joint_weights=[1.] * 22,
+ sigmas=[])
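
Horse-10 is the first config here with `sigmas=[]`: without per-keypoint sigmas OKS is undefined, and this dataset is conventionally evaluated with PCK instead. Its `skeleton_info` is used purely for rendering; a hedged sketch of a visualizer consuming these links (an OpenCV helper for illustration, not mmpose's `PoseLocalVisualizer`):

```python
import cv2
import numpy as np

def draw_skeleton(img, keypoints, keypoint_info, skeleton_info, thr=0.3):
    """Overlay dataset_info links on an image in place.

    keypoints: (K, 3) array of (x, y, score) in keypoint-id order;
    link endpoints and colors come straight from the config dicts.
    """
    name_to_id = {v['name']: v['id'] for v in keypoint_info.values()}
    for bone in skeleton_info.values():
        a, b = (name_to_id[name] for name in bone['link'])
        if keypoints[a, 2] < thr or keypoints[b, 2] < thr:
            continue  # skip links with a low-confidence endpoint
        pt_a = tuple(int(v) for v in keypoints[a, :2])
        pt_b = tuple(int(v) for v in keypoints[b, :2])
        # config colors look like RGB; OpenCV expects BGR, hence the reverse
        cv2.line(img, pt_a, pt_b, tuple(bone['color'][::-1]), thickness=2)
    return img
```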
diff --git a/modules/rtmpose/configs/_base_/datasets/humanart.py b/modules/rtmpose/configs/_base_/datasets/humanart.py
new file mode 100644
index 0000000..8054926
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/humanart.py
@@ -0,0 +1,181 @@
+dataset_info = dict(
+ dataset_name='Human-Art',
+ paper_info=dict(
+ author='Ju, Xuan and Zeng, Ailing and '
+ 'Wang, Jianan and Xu, Qiang and Zhang, Lei',
+ title='Human-Art: A Versatile Human-Centric Dataset '
+ 'Bridging Natural and Artificial Scenes',
+ container='Proceedings of the IEEE/CVF Conference on '
+ 'Computer Vision and Pattern Recognition',
+ year='2023',
+ homepage='https://idea-research.github.io/HumanArt/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
+ ])
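
`joint_weights` rescales the training loss per keypoint: wrists and ankles (1.5) are harder to localize than shoulders or hips, so their errors are penalized more. A simplified coordinate-space analogue of that weighting (mmpose applies the same idea inside its heatmap/SimCC losses):

```python
import numpy as np

def weighted_keypoint_mse(pred, target, visible, joint_weights):
    """Visibility-masked, per-keypoint-weighted MSE.

    pred, target: (K, 2) coordinates; visible: (K,) 0/1 flags;
    joint_weights: the (K,) list from the config above.
    """
    w = np.asarray(joint_weights) * (np.asarray(visible) > 0)
    per_kpt = np.sum((np.asarray(pred) - np.asarray(target)) ** 2, axis=-1)
    return float(np.sum(w * per_kpt) / max(float(np.sum(w)), 1e-6))
```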
diff --git a/modules/rtmpose/configs/_base_/datasets/humanart21.py b/modules/rtmpose/configs/_base_/datasets/humanart21.py
new file mode 100644
index 0000000..b8cb226
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/humanart21.py
@@ -0,0 +1,218 @@
+dataset_info = dict(
+ dataset_name='Human-Art',
+ paper_info=dict(
+ author='Ju, Xuan and Zeng, Ailing and '
+ 'Wang, Jianan and Xu, Qiang and Zhang, Lei',
+ title='Human-Art: A Versatile Human-Centric Dataset '
+ 'Bridging Natural and Artificial Scenes',
+ container='Proceedings of the IEEE/CVF Conference on '
+ 'Computer Vision and Pattern Recognition',
+ year='2023',
+ homepage='https://idea-research.github.io/HumanArt/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='left_finger',
+ id=17,
+ color=[0, 255, 0],
+ type='upper',

+ swap='right_finger'),
+ 18:
+ dict(
+ name='right_finger',
+ id=18,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_finger'),
+ 19:
+ dict(
+ name='left_toe',
+ id=19,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_toe'),
+ 20:
+ dict(
+ name='right_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_toe'),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ankle', 'left_toe'), id=19, color=[0, 255, 0]),
+ 20:
+ dict(link=('right_ankle', 'right_toe'), id=20, color=[255, 128, 0]),
+ 21:
+ dict(link=('left_wrist', 'left_finger'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(link=('right_wrist', 'right_finger'), id=22, color=[255, 128, 0]),
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5, 1., 1., 1., 1.
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089, 0.089,
+ 0.089
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/humanart_aic.py b/modules/rtmpose/configs/_base_/datasets/humanart_aic.py
new file mode 100644
index 0000000..573e9a0
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/humanart_aic.py
@@ -0,0 +1,205 @@
+dataset_info = dict(
+ dataset_name='humanart',
+ paper_info=[
+ dict(
+ author='Ju, Xuan and Zeng, Ailing and '
+ 'Wang, Jianan and Xu, Qiang and Zhang, '
+ 'Lei',
+ title='Human-Art: A Versatile Human-Centric Dataset '
+ 'Bridging Natural and Artificial Scenes',
+ container='CVPR',
+ year='2023',
+ homepage='https://idea-research.github.io/HumanArt/',
+ ),
+ dict(
+ author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
+ 'Li, Yixin and Yan, Baoming and Liang, Rui and '
+ 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
+ 'Fu, Yanwei and others',
+ title='Ai challenger: A large-scale dataset for going '
+ 'deeper in image understanding',
+ container='arXiv',
+ year='2017',
+ homepage='https://github.com/AIChallenger/AI_Challenger_2017',
+ ),
+ ],
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='head_top',
+ id=17,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 18:
+ dict(name='neck', id=18, color=[51, 153, 255], type='upper', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('head_top', 'neck'), id=19, color=[51, 153, 255]),
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5, 1.5, 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.026, 0.026
+ ])
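
Metadata of this size drifts easily under hand edits: an `id` that no longer matches its key, or a `joint_weights` list one entry short of `keypoint_info`. A small consistency check over the dict shape used throughout these files, assuming only the structure visible above:

```python
def validate_dataset_info(info):
    """Assert the internal consistency of a dataset_info dict."""
    kpts = info['keypoint_info']
    names = {kpt['name'] for kpt in kpts.values()}
    for key, kpt in kpts.items():
        assert kpt['id'] == key, f'keypoint {key}: id={kpt["id"]}'
        assert kpt['swap'] == '' or kpt['swap'] in names, \
            f'keypoint {key}: unknown swap {kpt["swap"]!r}'
    for key, bone in info['skeleton_info'].items():
        assert bone['id'] == key, f'link {key}: id={bone["id"]}'
        assert all(name in names for name in bone['link'])
    assert len(info['joint_weights']) == len(kpts), 'joint_weights length'
    assert len(info['sigmas']) in (0, len(kpts)), 'sigmas length'
```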
diff --git a/modules/rtmpose/configs/_base_/datasets/interhand2d.py b/modules/rtmpose/configs/_base_/datasets/interhand2d.py
new file mode 100644
index 0000000..e60dfc7
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/interhand2d.py
@@ -0,0 +1,142 @@
+dataset_info = dict(
+ dataset_name='interhand2d',
+ paper_info=dict(
+ author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and '
+ 'Shiratori, Takaaki and Lee, Kyoung Mu',
+ title='InterHand2.6M: A dataset and baseline for 3D '
+ 'interacting hand pose estimation from a single RGB image',
+ container='arXiv',
+ year='2020',
+ homepage='https://mks0601.github.io/InterHand2.6M/',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='thumb4', id=0, color=[255, 128, 0], type='', swap=''),
+ 1:
+ dict(name='thumb3', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb1', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(
+ name='forefinger4', id=4, color=[255, 153, 255], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger3', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger1', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='middle_finger4',
+ id=8,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 9:
+ dict(
+ name='middle_finger3',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger1',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='ring_finger4', id=12, color=[255, 51, 51], type='', swap=''),
+ 13:
+ dict(
+ name='ring_finger3', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger1', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(name='pinky_finger4', id=16, color=[0, 255, 0], type='', swap=''),
+ 17:
+ dict(name='pinky_finger3', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger1', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='wrist', id=20, color=[255, 255, 255], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[])
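
These metadata files are plain Python configs, not importable modules: mmpose loads them through mmengine's config machinery, and dataset definitions reference them by path. A short sketch of both patterns (the file path is assumed from this repo's layout):

```python
from mmengine.config import Config

# Load a metadata file directly and inspect it.
cfg = Config.fromfile('modules/rtmpose/configs/_base_/datasets/interhand2d.py')
info = cfg.dataset_info
print(info['dataset_name'], len(info['keypoint_info']))  # interhand2d 21

# Inside an mmpose dataset config the same file is referenced by path:
# metainfo = dict(from_file='configs/_base_/datasets/interhand2d.py')
```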
diff --git a/modules/rtmpose/configs/_base_/datasets/interhand3d.py b/modules/rtmpose/configs/_base_/datasets/interhand3d.py
new file mode 100644
index 0000000..26b7ccf
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/interhand3d.py
@@ -0,0 +1,487 @@
+dataset_info = dict(
+ dataset_name='interhand3d',
+ paper_info=dict(
+ author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and '
+ 'Shiratori, Takaaki and Lee, Kyoung Mu',
+ title='InterHand2.6M: A dataset and baseline for 3D '
+ 'interacting hand pose estimation from a single RGB image',
+ container='arXiv',
+ year='2020',
+ homepage='https://mks0601.github.io/InterHand2.6M/',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_thumb4',
+ id=0,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb4'),
+ 1:
+ dict(
+ name='right_thumb3',
+ id=1,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb3'),
+ 2:
+ dict(
+ name='right_thumb2',
+ id=2,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb2'),
+ 3:
+ dict(
+ name='right_thumb1',
+ id=3,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb1'),
+ 4:
+ dict(
+ name='right_forefinger4',
+ id=4,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger4'),
+ 5:
+ dict(
+ name='right_forefinger3',
+ id=5,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger3'),
+ 6:
+ dict(
+ name='right_forefinger2',
+ id=6,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger2'),
+ 7:
+ dict(
+ name='right_forefinger1',
+ id=7,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger1'),
+ 8:
+ dict(
+ name='right_middle_finger4',
+ id=8,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 9:
+ dict(
+ name='right_middle_finger3',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 10:
+ dict(
+ name='right_middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 11:
+ dict(
+ name='right_middle_finger1',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 12:
+ dict(
+ name='right_ring_finger4',
+ id=12,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger4'),
+ 13:
+ dict(
+ name='right_ring_finger3',
+ id=13,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger3'),
+ 14:
+ dict(
+ name='right_ring_finger2',
+ id=14,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger2'),
+ 15:
+ dict(
+ name='right_ring_finger1',
+ id=15,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger1'),
+ 16:
+ dict(
+ name='right_pinky_finger4',
+ id=16,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger4'),
+ 17:
+ dict(
+ name='right_pinky_finger3',
+ id=17,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger3'),
+ 18:
+ dict(
+ name='right_pinky_finger2',
+ id=18,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger2'),
+ 19:
+ dict(
+ name='right_pinky_finger1',
+ id=19,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger1'),
+ 20:
+ dict(
+ name='right_wrist',
+ id=20,
+ color=[255, 255, 255],
+ type='',
+ swap='left_wrist'),
+ 21:
+ dict(
+ name='left_thumb4',
+ id=21,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb4'),
+ 22:
+ dict(
+ name='left_thumb3',
+ id=22,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb3'),
+ 23:
+ dict(
+ name='left_thumb2',
+ id=23,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb2'),
+ 24:
+ dict(
+ name='left_thumb1',
+ id=24,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb1'),
+ 25:
+ dict(
+ name='left_forefinger4',
+ id=25,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger4'),
+ 26:
+ dict(
+ name='left_forefinger3',
+ id=26,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger3'),
+ 27:
+ dict(
+ name='left_forefinger2',
+ id=27,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger2'),
+ 28:
+ dict(
+ name='left_forefinger1',
+ id=28,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger1'),
+ 29:
+ dict(
+ name='left_middle_finger4',
+ id=29,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 30:
+ dict(
+ name='left_middle_finger3',
+ id=30,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 31:
+ dict(
+ name='left_middle_finger2',
+ id=31,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 32:
+ dict(
+ name='left_middle_finger1',
+ id=32,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 33:
+ dict(
+ name='left_ring_finger4',
+ id=33,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger4'),
+ 34:
+ dict(
+ name='left_ring_finger3',
+ id=34,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger3'),
+ 35:
+ dict(
+ name='left_ring_finger2',
+ id=35,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger2'),
+ 36:
+ dict(
+ name='left_ring_finger1',
+ id=36,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger1'),
+ 37:
+ dict(
+ name='left_pinky_finger4',
+ id=37,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger4'),
+ 38:
+ dict(
+ name='left_pinky_finger3',
+ id=38,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger3'),
+ 39:
+ dict(
+ name='left_pinky_finger2',
+ id=39,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger2'),
+ 40:
+ dict(
+ name='left_pinky_finger1',
+ id=40,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger1'),
+ 41:
+ dict(
+ name='left_wrist',
+ id=41,
+ color=[255, 255, 255],
+ type='',
+ swap='right_wrist'),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_wrist', 'right_thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('right_thumb1', 'right_thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('right_thumb2', 'right_thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_thumb3', 'right_thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(
+ link=('right_wrist', 'right_forefinger1'),
+ id=4,
+ color=[255, 153, 255]),
+ 5:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=5,
+ color=[255, 153, 255]),
+ 6:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=6,
+ color=[255, 153, 255]),
+ 7:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=7,
+ color=[255, 153, 255]),
+ 8:
+ dict(
+ link=('right_wrist', 'right_middle_finger1'),
+ id=8,
+ color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(
+ link=('right_wrist', 'right_ring_finger1'),
+ id=12,
+ color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=13,
+ color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=14,
+ color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=15,
+ color=[255, 51, 51]),
+ 16:
+ dict(
+ link=('right_wrist', 'right_pinky_finger1'),
+ id=16,
+ color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=17,
+ color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=18,
+ color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=19,
+ color=[0, 255, 0]),
+ 20:
+ dict(link=('left_wrist', 'left_thumb1'), id=20, color=[255, 128, 0]),
+ 21:
+ dict(link=('left_thumb1', 'left_thumb2'), id=21, color=[255, 128, 0]),
+ 22:
+ dict(link=('left_thumb2', 'left_thumb3'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(link=('left_thumb3', 'left_thumb4'), id=23, color=[255, 128, 0]),
+ 24:
+ dict(
+ link=('left_wrist', 'left_forefinger1'),
+ id=24,
+ color=[255, 153, 255]),
+ 25:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=25,
+ color=[255, 153, 255]),
+ 26:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=26,
+ color=[255, 153, 255]),
+ 27:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=27,
+ color=[255, 153, 255]),
+ 28:
+ dict(
+ link=('left_wrist', 'left_middle_finger1'),
+ id=28,
+ color=[102, 178, 255]),
+ 29:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=29,
+ color=[102, 178, 255]),
+ 30:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=30,
+ color=[102, 178, 255]),
+ 31:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=31,
+ color=[102, 178, 255]),
+ 32:
+ dict(
+ link=('left_wrist', 'left_ring_finger1'),
+ id=32,
+ color=[255, 51, 51]),
+ 33:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=33,
+ color=[255, 51, 51]),
+ 34:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=34,
+ color=[255, 51, 51]),
+ 35:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=35,
+ color=[255, 51, 51]),
+ 36:
+ dict(
+ link=('left_wrist', 'left_pinky_finger1'),
+ id=36,
+ color=[0, 255, 0]),
+ 37:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=37,
+ color=[0, 255, 0]),
+ 38:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=38,
+ color=[0, 255, 0]),
+ 39:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=39,
+ color=[0, 255, 0]),
+ },
+ joint_weights=[1.] * 42,
+ sigmas=[])
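
Unlike interhand2d, this config concatenates both hands into one 42-keypoint layout: ids 0-20 are the right hand and 21-41 the left, with each wrist last within its block. Downstream code often needs to split them again; a sketch assuming only the ordering visible above:

```python
def split_hands(keypoints):
    """Split a (42, D) interhand3d keypoint array into the two hands.

    Per the config above, ids 0-20 are the right hand and ids 21-41
    the left hand, so a plain slice recovers each one.
    """
    return {'right': keypoints[:21], 'left': keypoints[21:42]}
```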
diff --git a/modules/rtmpose/configs/_base_/datasets/jhmdb.py b/modules/rtmpose/configs/_base_/datasets/jhmdb.py
new file mode 100644
index 0000000..1f931fc
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/jhmdb.py
@@ -0,0 +1,129 @@
+dataset_info = dict(
+ dataset_name='jhmdb',
+ paper_info=dict(
+ author='H. Jhuang and J. Gall and S. Zuffi and '
+ 'C. Schmid and M. J. Black',
+ title='Towards understanding action recognition',
+ container='International Conf. on Computer Vision (ICCV)',
+ year='2013',
+ homepage='http://jhmdb.is.tue.mpg.de/dataset',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='neck', id=0, color=[255, 128, 0], type='upper', swap=''),
+ 1:
+ dict(name='belly', id=1, color=[255, 128, 0], type='upper', swap=''),
+ 2:
+ dict(name='head', id=2, color=[255, 128, 0], type='upper', swap=''),
+ 3:
+ dict(
+ name='right_shoulder',
+ id=3,
+ color=[0, 255, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 4:
+ dict(
+ name='left_shoulder',
+ id=4,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 5:
+ dict(
+ name='right_hip',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='left_hip'),
+ 6:
+ dict(
+ name='left_hip',
+ id=6,
+ color=[51, 153, 255],
+ type='lower',
+ swap='right_hip'),
+ 7:
+ dict(
+ name='right_elbow',
+ id=7,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_elbow'),
+ 8:
+ dict(
+ name='left_elbow',
+ id=8,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_elbow'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[51, 153, 255],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='left_knee',
+ id=10,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_knee'),
+ 11:
+ dict(
+ name='right_wrist',
+ id=11,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 12:
+ dict(
+ name='left_wrist',
+ id=12,
+ color=[255, 128, 0],
+ type='upper',
+ swap='right_wrist'),
+ 13:
+ dict(
+ name='right_ankle',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='left_ankle'),
+ 14:
+ dict(
+ name='left_ankle',
+ id=14,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle')
+ },
+ skeleton_info={
+ 0: dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
+ 1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
+ 2: dict(link=('right_hip', 'belly'), id=2, color=[255, 128, 0]),
+ 3: dict(link=('belly', 'left_hip'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
+ 5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
+ 6: dict(link=('belly', 'neck'), id=6, color=[51, 153, 255]),
+ 7: dict(link=('neck', 'head'), id=7, color=[51, 153, 255]),
+ 8: dict(link=('neck', 'right_shoulder'), id=8, color=[255, 128, 0]),
+ 9: dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('right_elbow', 'right_wrist'), id=10, color=[255, 128, 0]),
+ 11: dict(link=('neck', 'left_shoulder'), id=11, color=[0, 255, 0]),
+ 12:
+ dict(link=('left_shoulder', 'left_elbow'), id=12, color=[0, 255, 0]),
+ 13: dict(link=('left_elbow', 'left_wrist'), id=13, color=[0, 255, 0])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, 1.5, 1.5, 1.5, 1.5
+ ],
+ # Adapted from COCO dataset.
+ sigmas=[
+ 0.025, 0.107, 0.025, 0.079, 0.079, 0.107, 0.107, 0.072, 0.072, 0.087,
+ 0.087, 0.062, 0.062, 0.089, 0.089
+ ])
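
JHMDB's sigmas are marked as adapted from COCO, but sub-JHMDB results are conventionally reported with PCK, which needs no sigmas at all. A minimal PCK sketch for reference (threshold and normalization size are the usual free parameters):

```python
import numpy as np

def pck(pred, gt, visible, norm_size, thr=0.2):
    """Percentage of Correct Keypoints.

    A keypoint counts as correct when its distance to ground truth
    falls below thr * norm_size (norm_size: e.g. torso or bbox size).
    """
    d = np.linalg.norm(np.asarray(pred) - np.asarray(gt), axis=-1)
    mask = np.asarray(visible) > 0
    return float(np.mean(d[mask] < thr * norm_size)) if mask.any() else 0.0
```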
diff --git a/modules/rtmpose/configs/_base_/datasets/lapa.py b/modules/rtmpose/configs/_base_/datasets/lapa.py
new file mode 100644
index 0000000..56ff1e6
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/lapa.py
@@ -0,0 +1,246 @@
+dataset_info = dict(
+ dataset_name='lapa',
+ paper_info=dict(
+ author='Liu, Yinglu and Shi, Hailin and Shen, Hao and Si, '
+ 'Yue and Wang, Xiaobo and Mei, Tao',
+ title='A New Dataset and Boundary-Attention Semantic '
+ 'Segmentation for Face Parsing.',
+ container='Proceedings of the AAAI Conference on '
+ 'Artificial Intelligence 2020',
+ year='2020',
+ homepage='https://github.com/JDAI-CV/lapa-dataset',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-32'),
+ 1:
+ dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-31'),
+ 2:
+ dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-30'),
+ 3:
+ dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-29'),
+ 4:
+ dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-28'),
+ 5:
+ dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-27'),
+ 6:
+ dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-26'),
+ 7:
+ dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-25'),
+ 8:
+ dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-24'),
+ 9:
+ dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-23'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-22'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-21'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-20'),
+ 13:
+ dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-19'),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-18'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'),
+ 16:
+ dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'),
+ 18:
+ dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-14'),
+ 19:
+ dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-13'),
+ 20:
+ dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-12'),
+ 21:
+ dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-11'),
+ 22:
+ dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-10'),
+ 23:
+ dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-9'),
+ 24:
+ dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-8'),
+ 25:
+ dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-7'),
+ 26:
+ dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-6'),
+ 27:
+ dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap='kpt-5'),
+ 28:
+ dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='kpt-4'),
+ 29:
+ dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap='kpt-3'),
+ 30:
+ dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap='kpt-2'),
+ 31:
+ dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-1'),
+ 32:
+ dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-0'),
+ 33:
+ dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap='kpt-46'),
+ 34:
+ dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-45'),
+ 35:
+ dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-44'),
+ 36:
+ dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-43'),
+ 37:
+ dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-42'),
+ 38:
+ dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-50'),
+ 39:
+ dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-49'),
+ 40:
+ dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-48'),
+ 41:
+ dict(name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-47'),
+ 42:
+ dict(name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-37'),
+ 43:
+ dict(name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-36'),
+ 44:
+ dict(name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-35'),
+ 45:
+ dict(name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-34'),
+ 46:
+ dict(name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-33'),
+ 47:
+ dict(name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-41'),
+ 48:
+ dict(name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-40'),
+ 49:
+ dict(name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-39'),
+ 50:
+ dict(name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-38'),
+ 51:
+ dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52:
+ dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''),
+ 53:
+ dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''),
+ 54:
+ dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''),
+ 55:
+ dict(name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-65'),
+ 56:
+ dict(name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-64'),
+ 57:
+ dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap='kpt-63'),
+ 58:
+ dict(name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-62'),
+ 59:
+ dict(name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-61'),
+ 60:
+ dict(name='kpt-60', id=60, color=[255, 0, 0], type='', swap=''),
+ 61:
+ dict(name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-59'),
+ 62:
+ dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap='kpt-58'),
+ 63:
+ dict(name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-57'),
+ 64:
+ dict(name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-56'),
+ 65:
+ dict(name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-55'),
+ 66:
+ dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap='kpt-79'),
+ 67:
+ dict(name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-78'),
+ 68:
+ dict(name='kpt-68', id=68, color=[255, 0, 0], type='', swap='kpt-77'),
+ 69:
+ dict(name='kpt-69', id=69, color=[255, 0, 0], type='', swap='kpt-76'),
+ 70:
+ dict(name='kpt-70', id=70, color=[255, 0, 0], type='', swap='kpt-75'),
+ 71:
+ dict(name='kpt-71', id=71, color=[255, 0, 0], type='', swap='kpt-82'),
+ 72:
+ dict(name='kpt-72', id=72, color=[255, 0, 0], type='', swap='kpt-81'),
+ 73:
+ dict(name='kpt-73', id=73, color=[255, 0, 0], type='', swap='kpt-80'),
+ 74:
+ dict(name='kpt-74', id=74, color=[255, 0, 0], type='', swap='kpt-83'),
+ 75:
+ dict(name='kpt-75', id=75, color=[255, 0, 0], type='', swap='kpt-70'),
+ 76:
+ dict(name='kpt-76', id=76, color=[255, 0, 0], type='', swap='kpt-69'),
+ 77:
+ dict(name='kpt-77', id=77, color=[255, 0, 0], type='', swap='kpt-68'),
+ 78:
+ dict(name='kpt-78', id=78, color=[255, 0, 0], type='', swap='kpt-67'),
+ 79:
+ dict(name='kpt-79', id=79, color=[255, 0, 0], type='', swap='kpt-66'),
+ 80:
+ dict(name='kpt-80', id=80, color=[255, 0, 0], type='', swap='kpt-73'),
+ 81:
+ dict(name='kpt-81', id=81, color=[255, 0, 0], type='', swap='kpt-72'),
+ 82:
+ dict(name='kpt-82', id=82, color=[255, 0, 0], type='', swap='kpt-71'),
+ 83:
+ dict(name='kpt-83', id=83, color=[255, 0, 0], type='', swap='kpt-74'),
+ 84:
+ dict(name='kpt-84', id=84, color=[255, 0, 0], type='', swap='kpt-90'),
+ 85:
+ dict(name='kpt-85', id=85, color=[255, 0, 0], type='', swap='kpt-89'),
+ 86:
+ dict(name='kpt-86', id=86, color=[255, 0, 0], type='', swap='kpt-88'),
+ 87:
+ dict(name='kpt-87', id=87, color=[255, 0, 0], type='', swap=''),
+ 88:
+ dict(name='kpt-88', id=88, color=[255, 0, 0], type='', swap='kpt-86'),
+ 89:
+ dict(name='kpt-89', id=89, color=[255, 0, 0], type='', swap='kpt-85'),
+ 90:
+ dict(name='kpt-90', id=90, color=[255, 0, 0], type='', swap='kpt-84'),
+ 91:
+ dict(name='kpt-91', id=91, color=[255, 0, 0], type='', swap='kpt-95'),
+ 92:
+ dict(name='kpt-92', id=92, color=[255, 0, 0], type='', swap='kpt-94'),
+ 93:
+ dict(name='kpt-93', id=93, color=[255, 0, 0], type='', swap=''),
+ 94:
+ dict(name='kpt-94', id=94, color=[255, 0, 0], type='', swap='kpt-92'),
+ 95:
+ dict(name='kpt-95', id=95, color=[255, 0, 0], type='', swap='kpt-91'),
+ 96:
+ dict(name='kpt-96', id=96, color=[255, 0, 0], type='', swap='kpt-100'),
+ 97:
+ dict(name='kpt-97', id=97, color=[255, 0, 0], type='', swap='kpt-99'),
+ 98:
+ dict(name='kpt-98', id=98, color=[255, 0, 0], type='', swap=''),
+ 99:
+ dict(name='kpt-99', id=99, color=[255, 0, 0], type='', swap='kpt-97'),
+ 100:
+ dict(
+ name='kpt-100', id=100, color=[255, 0, 0], type='', swap='kpt-96'),
+ 101:
+ dict(
+ name='kpt-101', id=101, color=[255, 0, 0], type='',
+ swap='kpt-103'),
+ 102:
+ dict(name='kpt-102', id=102, color=[255, 0, 0], type='', swap=''),
+ 103:
+ dict(
+ name='kpt-103', id=103, color=[255, 0, 0], type='',
+ swap='kpt-101'),
+ 104:
+ dict(
+ name='kpt-104', id=104, color=[255, 0, 0], type='',
+ swap='kpt-105'),
+ 105:
+ dict(
+ name='kpt-105', id=105, color=[255, 0, 0], type='', swap='kpt-104')
+ },
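+    # LaPa defines no limb links, so visualizers draw the keypoints only.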
+ skeleton_info={},
+ joint_weights=[
+ 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8,
+ 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8,
+ 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0,
+ 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0,
+ 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5,
+ 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.0, 1.0
+ ],
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/locust.py b/modules/rtmpose/configs/_base_/datasets/locust.py
new file mode 100644
index 0000000..3a6fafd
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/locust.py
@@ -0,0 +1,263 @@
+dataset_info = dict(
+ dataset_name='locust',
+ paper_info=dict(
+ author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and '
+ 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and '
+ 'Couzin, Iain D',
+ title='DeepPoseKit, a software toolkit for fast and robust '
+ 'animal pose estimation using deep learning',
+        container='eLife',
+ year='2019',
+ homepage='https://github.com/jgraving/DeepPoseKit-Data',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='head', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='neck', id=1, color=[255, 255, 255], type='', swap=''),
+ 2:
+ dict(name='thorax', id=2, color=[255, 255, 255], type='', swap=''),
+ 3:
+ dict(name='abdomen1', id=3, color=[255, 255, 255], type='', swap=''),
+ 4:
+ dict(name='abdomen2', id=4, color=[255, 255, 255], type='', swap=''),
+ 5:
+ dict(
+ name='anttipL',
+ id=5,
+ color=[255, 255, 255],
+ type='',
+ swap='anttipR'),
+ 6:
+ dict(
+ name='antbaseL',
+ id=6,
+ color=[255, 255, 255],
+ type='',
+ swap='antbaseR'),
+ 7:
+ dict(name='eyeL', id=7, color=[255, 255, 255], type='', swap='eyeR'),
+ 8:
+ dict(
+ name='forelegL1',
+ id=8,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR1'),
+ 9:
+ dict(
+ name='forelegL2',
+ id=9,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR2'),
+ 10:
+ dict(
+ name='forelegL3',
+ id=10,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR3'),
+ 11:
+ dict(
+ name='forelegL4',
+ id=11,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR4'),
+ 12:
+ dict(
+ name='midlegL1',
+ id=12,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR1'),
+ 13:
+ dict(
+ name='midlegL2',
+ id=13,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR2'),
+ 14:
+ dict(
+ name='midlegL3',
+ id=14,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR3'),
+ 15:
+ dict(
+ name='midlegL4',
+ id=15,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegR4'),
+ 16:
+ dict(
+ name='hindlegL1',
+ id=16,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR1'),
+ 17:
+ dict(
+ name='hindlegL2',
+ id=17,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR2'),
+ 18:
+ dict(
+ name='hindlegL3',
+ id=18,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR3'),
+ 19:
+ dict(
+ name='hindlegL4',
+ id=19,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR4'),
+ 20:
+ dict(
+ name='anttipR',
+ id=20,
+ color=[255, 255, 255],
+ type='',
+ swap='anttipL'),
+ 21:
+ dict(
+ name='antbaseR',
+ id=21,
+ color=[255, 255, 255],
+ type='',
+ swap='antbaseL'),
+ 22:
+ dict(name='eyeR', id=22, color=[255, 255, 255], type='', swap='eyeL'),
+ 23:
+ dict(
+ name='forelegR1',
+ id=23,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL1'),
+ 24:
+ dict(
+ name='forelegR2',
+ id=24,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL2'),
+ 25:
+ dict(
+ name='forelegR3',
+ id=25,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL3'),
+ 26:
+ dict(
+ name='forelegR4',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL4'),
+ 27:
+ dict(
+ name='midlegR1',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL1'),
+ 28:
+ dict(
+ name='midlegR2',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL2'),
+ 29:
+ dict(
+ name='midlegR3',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL3'),
+ 30:
+ dict(
+ name='midlegR4',
+ id=30,
+ color=[255, 255, 255],
+ type='',
+ swap='midlegL4'),
+ 31:
+ dict(
+ name='hindlegR1',
+ id=31,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL1'),
+ 32:
+ dict(
+ name='hindlegR2',
+ id=32,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL2'),
+ 33:
+ dict(
+ name='hindlegR3',
+ id=33,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL3'),
+ 34:
+ dict(
+ name='hindlegR4',
+ id=34,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL4')
+ },
+ skeleton_info={
+ 0: dict(link=('neck', 'head'), id=0, color=[255, 255, 255]),
+ 1: dict(link=('thorax', 'neck'), id=1, color=[255, 255, 255]),
+ 2: dict(link=('abdomen1', 'thorax'), id=2, color=[255, 255, 255]),
+ 3: dict(link=('abdomen2', 'abdomen1'), id=3, color=[255, 255, 255]),
+ 4: dict(link=('antbaseL', 'anttipL'), id=4, color=[255, 255, 255]),
+ 5: dict(link=('eyeL', 'antbaseL'), id=5, color=[255, 255, 255]),
+ 6: dict(link=('forelegL2', 'forelegL1'), id=6, color=[255, 255, 255]),
+ 7: dict(link=('forelegL3', 'forelegL2'), id=7, color=[255, 255, 255]),
+ 8: dict(link=('forelegL4', 'forelegL3'), id=8, color=[255, 255, 255]),
+ 9: dict(link=('midlegL2', 'midlegL1'), id=9, color=[255, 255, 255]),
+ 10: dict(link=('midlegL3', 'midlegL2'), id=10, color=[255, 255, 255]),
+ 11: dict(link=('midlegL4', 'midlegL3'), id=11, color=[255, 255, 255]),
+ 12:
+ dict(link=('hindlegL2', 'hindlegL1'), id=12, color=[255, 255, 255]),
+ 13:
+ dict(link=('hindlegL3', 'hindlegL2'), id=13, color=[255, 255, 255]),
+ 14:
+ dict(link=('hindlegL4', 'hindlegL3'), id=14, color=[255, 255, 255]),
+ 15: dict(link=('antbaseR', 'anttipR'), id=15, color=[255, 255, 255]),
+ 16: dict(link=('eyeR', 'antbaseR'), id=16, color=[255, 255, 255]),
+ 17:
+ dict(link=('forelegR2', 'forelegR1'), id=17, color=[255, 255, 255]),
+ 18:
+ dict(link=('forelegR3', 'forelegR2'), id=18, color=[255, 255, 255]),
+ 19:
+ dict(link=('forelegR4', 'forelegR3'), id=19, color=[255, 255, 255]),
+ 20: dict(link=('midlegR2', 'midlegR1'), id=20, color=[255, 255, 255]),
+ 21: dict(link=('midlegR3', 'midlegR2'), id=21, color=[255, 255, 255]),
+ 22: dict(link=('midlegR4', 'midlegR3'), id=22, color=[255, 255, 255]),
+ 23:
+ dict(link=('hindlegR2', 'hindlegR1'), id=23, color=[255, 255, 255]),
+ 24:
+ dict(link=('hindlegR3', 'hindlegR2'), id=24, color=[255, 255, 255]),
+ 25:
+ dict(link=('hindlegR4', 'hindlegR3'), id=25, color=[255, 255, 255])
+ },
+ joint_weights=[1.] * 35,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/macaque.py b/modules/rtmpose/configs/_base_/datasets/macaque.py
new file mode 100644
index 0000000..926ca30
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/macaque.py
@@ -0,0 +1,183 @@
+dataset_info = dict(
+ dataset_name='macaque',
+ paper_info=dict(
+ author='Labuguen, Rollyn and Matsumoto, Jumpei and '
+ 'Negrete, Salvador and Nishimaru, Hiroshi and '
+ 'Nishijo, Hisao and Takada, Masahiko and '
+ 'Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro',
+ title='MacaquePose: A novel "in the wild" macaque monkey pose dataset '
+ 'for markerless motion capture',
+ container='bioRxiv',
+ year='2020',
+ homepage='http://www.pri.kyoto-u.ac.jp/datasets/'
+ 'macaquepose/index.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/mhp.py b/modules/rtmpose/configs/_base_/datasets/mhp.py
new file mode 100644
index 0000000..9c8c03c
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/mhp.py
@@ -0,0 +1,156 @@
+dataset_info = dict(
+ dataset_name='mhp',
+ paper_info=dict(
+ author='Zhao, Jian and Li, Jianshu and Cheng, Yu and '
+ 'Sim, Terence and Yan, Shuicheng and Feng, Jiashi',
+ title='Understanding humans in crowded scenes: '
+ 'Deep nested adversarial learning and a '
+ 'new benchmark for multi-human parsing',
+ container='Proceedings of the 26th ACM '
+        'International Conference on Multimedia',
+ year='2018',
+ homepage='https://lv-mhp.github.io/dataset',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_ankle',
+ id=0,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 1:
+ dict(
+ name='right_knee',
+ id=1,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 2:
+ dict(
+ name='right_hip',
+ id=2,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 3:
+ dict(
+ name='left_hip',
+ id=3,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 4:
+ dict(
+ name='left_knee',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 5:
+ dict(
+ name='left_ankle',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 6:
+ dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''),
+ 7:
+ dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''),
+ 8:
+ dict(
+ name='upper_neck',
+ id=8,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 9:
+ dict(
+ name='head_top', id=9, color=[51, 153, 255], type='upper',
+ swap=''),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='right_elbow',
+ id=11,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 12:
+ dict(
+ name='right_shoulder',
+ id=12,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 13:
+ dict(
+ name='left_shoulder',
+ id=13,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 14:
+ dict(
+ name='left_elbow',
+ id=14,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 15:
+ dict(
+ name='left_wrist',
+ id=15,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
+ 5:
+ dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
+ 6:
+ dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
+ 9:
+ dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(
+            link=('right_shoulder', 'right_elbow'),
+            id=10,
+            color=[255, 128, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]),
+ 14:
+ dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
+ },
+ joint_weights=[
+ 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5
+ ],
+ # Adapted from COCO dataset.
+ sigmas=[
+ 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026,
+ 0.062, 0.072, 0.179, 0.179, 0.072, 0.062
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py b/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py
new file mode 100644
index 0000000..ed088c2
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/mpi_inf_3dhp.py
@@ -0,0 +1,132 @@
+dataset_info = dict(
+ dataset_name='mpi_inf_3dhp',
+ paper_info=dict(
+        author='Mehta, Dushyant and Rhodin, Helge and Casas, Dan and '
+ 'Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and '
+ 'Theobalt, Christian',
+ title='Monocular 3D Human Pose Estimation In The Wild Using Improved '
+ 'CNN Supervision',
+        container='2017 International Conference on 3D Vision (3DV)',
+ year='2017',
+ homepage='http://gvv.mpi-inf.mpg.de/3dhp-dataset',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='head_top', id=0, color=[51, 153, 255], type='upper',
+ swap=''),
+ 1:
+ dict(name='neck', id=1, color=[51, 153, 255], type='upper', swap=''),
+ 2:
+ dict(
+ name='right_shoulder',
+ id=2,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='right_wrist',
+ id=4,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='left_elbow',
+ id=6,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 7:
+ dict(
+ name='left_wrist',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 8:
+ dict(
+ name='right_hip',
+ id=8,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='right_ankle',
+ id=10,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='left_knee',
+ id=12,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 13:
+ dict(
+ name='left_ankle',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 14:
+ dict(name='root', id=14, color=[51, 153, 255], type='lower', swap=''),
+ 15:
+ dict(name='spine', id=15, color=[51, 153, 255], type='upper', swap=''),
+ 16:
+ dict(name='head', id=16, color=[51, 153, 255], type='upper', swap='')
+ },
+ skeleton_info={
+ 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 128, 0]),
+ 1: dict(
+ link=('right_shoulder', 'right_elbow'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('right_elbow', 'right_wrist'), id=2, color=[255, 128, 0]),
+ 3: dict(link=('neck', 'left_shoulder'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
+ 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
+ 6: dict(link=('root', 'right_hip'), id=6, color=[255, 128, 0]),
+ 7: dict(link=('right_hip', 'right_knee'), id=7, color=[255, 128, 0]),
+ 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[255, 128, 0]),
+ 9: dict(link=('root', 'left_hip'), id=9, color=[0, 255, 0]),
+ 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]),
+ 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 255, 0]),
+ 12: dict(link=('head_top', 'head'), id=12, color=[51, 153, 255]),
+ 13: dict(link=('head', 'neck'), id=13, color=[51, 153, 255]),
+ 14: dict(link=('neck', 'spine'), id=14, color=[51, 153, 255]),
+ 15: dict(link=('spine', 'root'), id=15, color=[51, 153, 255])
+ },
+ joint_weights=[1.] * 17,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/mpii.py b/modules/rtmpose/configs/_base_/datasets/mpii.py
new file mode 100644
index 0000000..2723bae
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/mpii.py
@@ -0,0 +1,155 @@
+dataset_info = dict(
+ dataset_name='mpii',
+ paper_info=dict(
+        author='Andriluka, Mykhaylo and Pishchulin, Leonid and '
+        'Gehler, Peter and Schiele, Bernt',
+ title='2D Human Pose Estimation: New Benchmark and '
+ 'State of the Art Analysis',
+ container='IEEE Conference on Computer Vision and '
+ 'Pattern Recognition (CVPR)',
+ year='2014',
+ homepage='http://human-pose.mpi-inf.mpg.de/',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_ankle',
+ id=0,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 1:
+ dict(
+ name='right_knee',
+ id=1,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 2:
+ dict(
+ name='right_hip',
+ id=2,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 3:
+ dict(
+ name='left_hip',
+ id=3,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 4:
+ dict(
+ name='left_knee',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 5:
+ dict(
+ name='left_ankle',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 6:
+ dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''),
+ 7:
+ dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''),
+ 8:
+ dict(
+ name='upper_neck',
+ id=8,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 9:
+ dict(
+ name='head_top', id=9, color=[51, 153, 255], type='upper',
+ swap=''),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='right_elbow',
+ id=11,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 12:
+ dict(
+ name='right_shoulder',
+ id=12,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 13:
+ dict(
+ name='left_shoulder',
+ id=13,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 14:
+ dict(
+ name='left_elbow',
+ id=14,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 15:
+ dict(
+ name='left_wrist',
+ id=15,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
+ 5:
+ dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
+ 6:
+ dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
+ 9:
+ dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(
+            link=('right_shoulder', 'right_elbow'),
+            id=10,
+            color=[255, 128, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]),
+ 14:
+ dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
+ },
+ joint_weights=[
+ 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5
+ ],
+ # Adapted from COCO dataset.
+ sigmas=[
+ 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026,
+ 0.062, 0.072, 0.179, 0.179, 0.072, 0.062
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/mpii_trb.py b/modules/rtmpose/configs/_base_/datasets/mpii_trb.py
new file mode 100644
index 0000000..ddb7e9e
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/mpii_trb.py
@@ -0,0 +1,380 @@
+dataset_info = dict(
+ dataset_name='mpii_trb',
+ paper_info=dict(
+ author='Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and '
+ 'Liu, Wentao and Qian, Chen and Ouyang, Wanli',
+ title='TRB: A Novel Triplet Representation for '
+ 'Understanding 2D Human Body',
+ container='Proceedings of the IEEE International '
+ 'Conference on Computer Vision',
+ year='2019',
+ homepage='https://github.com/kennymckormick/'
+ 'Triplet-Representation-of-human-Body',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='left_shoulder',
+ id=0,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 1:
+ dict(
+ name='right_shoulder',
+ id=1,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 2:
+ dict(
+ name='left_elbow',
+ id=2,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 3:
+ dict(
+ name='right_elbow',
+ id=3,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 4:
+ dict(
+ name='left_wrist',
+ id=4,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 5:
+ dict(
+ name='right_wrist',
+ id=5,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 6:
+ dict(
+ name='left_hip',
+ id=6,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 7:
+ dict(
+ name='right_hip',
+ id=7,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 8:
+ dict(
+ name='left_knee',
+ id=8,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 9:
+ dict(
+ name='right_knee',
+ id=9,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 10:
+ dict(
+ name='left_ankle',
+ id=10,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 11:
+ dict(
+ name='right_ankle',
+ id=11,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 12:
+ dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''),
+ 13:
+ dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap=''),
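+        # ids 14-39 are the TRB contour points: the two neck points plus a
+        # medial/lateral pair flanking each of the 12 limb joints above.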
+ 14:
+ dict(
+ name='right_neck',
+ id=14,
+ color=[255, 255, 255],
+ type='upper',
+ swap='left_neck'),
+ 15:
+ dict(
+ name='left_neck',
+ id=15,
+ color=[255, 255, 255],
+ type='upper',
+ swap='right_neck'),
+ 16:
+ dict(
+ name='medial_right_shoulder',
+ id=16,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_left_shoulder'),
+ 17:
+ dict(
+ name='lateral_right_shoulder',
+ id=17,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_left_shoulder'),
+ 18:
+ dict(
+ name='medial_right_bow',
+ id=18,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_left_bow'),
+ 19:
+ dict(
+ name='lateral_right_bow',
+ id=19,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_left_bow'),
+ 20:
+ dict(
+ name='medial_right_wrist',
+ id=20,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_left_wrist'),
+ 21:
+ dict(
+ name='lateral_right_wrist',
+ id=21,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_left_wrist'),
+ 22:
+ dict(
+ name='medial_left_shoulder',
+ id=22,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_right_shoulder'),
+ 23:
+ dict(
+ name='lateral_left_shoulder',
+ id=23,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_right_shoulder'),
+ 24:
+ dict(
+ name='medial_left_bow',
+ id=24,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_right_bow'),
+ 25:
+ dict(
+ name='lateral_left_bow',
+ id=25,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_right_bow'),
+ 26:
+ dict(
+ name='medial_left_wrist',
+ id=26,
+ color=[255, 255, 255],
+ type='upper',
+ swap='medial_right_wrist'),
+ 27:
+ dict(
+ name='lateral_left_wrist',
+ id=27,
+ color=[255, 255, 255],
+ type='upper',
+ swap='lateral_right_wrist'),
+ 28:
+ dict(
+ name='medial_right_hip',
+ id=28,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_left_hip'),
+ 29:
+ dict(
+ name='lateral_right_hip',
+ id=29,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_left_hip'),
+ 30:
+ dict(
+ name='medial_right_knee',
+ id=30,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_left_knee'),
+ 31:
+ dict(
+ name='lateral_right_knee',
+ id=31,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_left_knee'),
+ 32:
+ dict(
+ name='medial_right_ankle',
+ id=32,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_left_ankle'),
+ 33:
+ dict(
+ name='lateral_right_ankle',
+ id=33,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_left_ankle'),
+ 34:
+ dict(
+ name='medial_left_hip',
+ id=34,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_right_hip'),
+ 35:
+ dict(
+ name='lateral_left_hip',
+ id=35,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_right_hip'),
+ 36:
+ dict(
+ name='medial_left_knee',
+ id=36,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_right_knee'),
+ 37:
+ dict(
+ name='lateral_left_knee',
+ id=37,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_right_knee'),
+ 38:
+ dict(
+ name='medial_left_ankle',
+ id=38,
+ color=[255, 255, 255],
+ type='lower',
+ swap='medial_right_ankle'),
+ 39:
+ dict(
+ name='lateral_left_ankle',
+ id=39,
+ color=[255, 255, 255],
+ type='lower',
+ swap='lateral_right_ankle'),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('head', 'neck'), id=0, color=[51, 153, 255]),
+ 1:
+ dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]),
+ 2:
+ dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]),
+ 3:
+ dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]),
+ 5:
+ dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
+ 6:
+ dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]),
+ 7:
+ dict(link=('left_shoulder', 'left_hip'), id=7, color=[51, 153, 255]),
+ 8:
+ dict(link=('right_shoulder', 'right_hip'), id=8, color=[51, 153, 255]),
+ 9:
+ dict(link=('left_hip', 'right_hip'), id=9, color=[51, 153, 255]),
+ 10:
+ dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_hip', 'right_knee'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('right_knee', 'right_ankle'), id=13, color=[255, 128, 0]),
+ 14:
+ dict(link=('right_neck', 'left_neck'), id=14, color=[255, 255, 255]),
+ 15:
+ dict(
+ link=('medial_right_shoulder', 'lateral_right_shoulder'),
+ id=15,
+ color=[255, 255, 255]),
+ 16:
+ dict(
+ link=('medial_right_bow', 'lateral_right_bow'),
+ id=16,
+ color=[255, 255, 255]),
+ 17:
+ dict(
+ link=('medial_right_wrist', 'lateral_right_wrist'),
+ id=17,
+ color=[255, 255, 255]),
+ 18:
+ dict(
+ link=('medial_left_shoulder', 'lateral_left_shoulder'),
+ id=18,
+ color=[255, 255, 255]),
+ 19:
+ dict(
+ link=('medial_left_bow', 'lateral_left_bow'),
+ id=19,
+ color=[255, 255, 255]),
+ 20:
+ dict(
+ link=('medial_left_wrist', 'lateral_left_wrist'),
+ id=20,
+ color=[255, 255, 255]),
+ 21:
+ dict(
+ link=('medial_right_hip', 'lateral_right_hip'),
+ id=21,
+ color=[255, 255, 255]),
+ 22:
+ dict(
+ link=('medial_right_knee', 'lateral_right_knee'),
+ id=22,
+ color=[255, 255, 255]),
+ 23:
+ dict(
+ link=('medial_right_ankle', 'lateral_right_ankle'),
+ id=23,
+ color=[255, 255, 255]),
+ 24:
+ dict(
+ link=('medial_left_hip', 'lateral_left_hip'),
+ id=24,
+ color=[255, 255, 255]),
+ 25:
+ dict(
+ link=('medial_left_knee', 'lateral_left_knee'),
+ id=25,
+ color=[255, 255, 255]),
+ 26:
+ dict(
+ link=('medial_left_ankle', 'lateral_left_ankle'),
+ id=26,
+ color=[255, 255, 255])
+ },
+ joint_weights=[1.] * 40,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/ochuman.py b/modules/rtmpose/configs/_base_/datasets/ochuman.py
new file mode 100644
index 0000000..e6e86ba
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/ochuman.py
@@ -0,0 +1,181 @@
+dataset_info = dict(
+ dataset_name='ochuman',
+ paper_info=dict(
+ author='Zhang, Song-Hai and Li, Ruilong and Dong, Xin and '
+ 'Rosin, Paul and Cai, Zixi and Han, Xi and '
+ 'Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min',
+        title='Pose2Seg: Detection free human instance segmentation',
+        container='Proceedings of the IEEE Conference on Computer '
+        'Vision and Pattern Recognition',
+ year='2019',
+ homepage='https://github.com/liruilong940607/OCHumanApi',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/onehand10k.py b/modules/rtmpose/configs/_base_/datasets/onehand10k.py
new file mode 100644
index 0000000..833f186
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/onehand10k.py
@@ -0,0 +1,142 @@
+dataset_info = dict(
+ dataset_name='onehand10k',
+ paper_info=dict(
+ author='Wang, Yangang and Peng, Cong and Liu, Yebin',
+        title='Mask-pose cascaded CNN for 2D hand pose estimation '
+ 'from single color image',
+ container='IEEE Transactions on Circuits and Systems '
+ 'for Video Technology',
+ year='2018',
+ homepage='https://www.yangangwang.com/papers/WANG-MCC-2018-10.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger1',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger3',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger4',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py b/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py
new file mode 100644
index 0000000..6623409
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/panoptic_body3d.py
@@ -0,0 +1,160 @@
+dataset_info = dict(
+ dataset_name='panoptic_pose_3d',
+ paper_info=dict(
+        author='Joo, Hanbyul and Simon, Tomas and Li, Xulong '
+        'and Liu, Hao and Tan, Lei and Gui, Lin and Banerjee, Sean '
+        'and Godisart, Timothy and Nabbe, Bart and Matthews, Iain '
+ 'and Kanade, Takeo and Nobuhara, Shohei and Sheikh, Yaser',
+ title='Panoptic Studio: A Massively Multiview System '
+ 'for Interaction Motion Capture',
+ container='IEEE Transactions on Pattern Analysis'
+ ' and Machine Intelligence',
+ year='2017',
+ homepage='http://domedb.perception.cs.cmu.edu',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='neck', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(name='nose', id=1, color=[51, 153, 255], type='upper', swap=''),
+ 2:
+ dict(name='mid_hip', id=2, color=[0, 255, 0], type='lower', swap=''),
+ 3:
+ dict(
+ name='left_shoulder',
+ id=3,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 4:
+ dict(
+ name='left_elbow',
+ id=4,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 5:
+ dict(
+ name='left_wrist',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 6:
+ dict(
+ name='left_hip',
+ id=6,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 7:
+ dict(
+ name='left_knee',
+ id=7,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 8:
+ dict(
+ name='left_ankle',
+ id=8,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 9:
+ dict(
+ name='right_shoulder',
+ id=9,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 10:
+ dict(
+ name='right_elbow',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 11:
+ dict(
+ name='right_wrist',
+ id=11,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='right_knee',
+ id=13,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 14:
+ dict(
+ name='right_ankle',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 15:
+ dict(
+ name='left_eye',
+ id=15,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 16:
+ dict(
+ name='left_ear',
+ id=16,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 17:
+ dict(
+ name='right_eye',
+ id=17,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 18:
+ dict(
+ name='right_ear',
+ id=18,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear')
+ },
+ skeleton_info={
+ 0: dict(link=('nose', 'neck'), id=0, color=[51, 153, 255]),
+ 1: dict(link=('neck', 'left_shoulder'), id=1, color=[0, 255, 0]),
+ 2: dict(link=('neck', 'right_shoulder'), id=2, color=[255, 128, 0]),
+ 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]),
+ 4: dict(
+ link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]),
+ 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
+ 6:
+ dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]),
+ 7: dict(link=('left_ankle', 'left_knee'), id=7, color=[0, 255, 0]),
+ 8: dict(link=('left_knee', 'left_hip'), id=8, color=[0, 255, 0]),
+ 9: dict(link=('right_ankle', 'right_knee'), id=9, color=[255, 128, 0]),
+ 10: dict(link=('right_knee', 'right_hip'), id=10, color=[255, 128, 0]),
+ 11: dict(link=('mid_hip', 'left_hip'), id=11, color=[0, 255, 0]),
+ 12: dict(link=('mid_hip', 'right_hip'), id=12, color=[255, 128, 0]),
+ 13: dict(link=('mid_hip', 'neck'), id=13, color=[51, 153, 255]),
+ },
+ joint_weights=[
+ 1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2,
+ 1.5, 1.0, 1.0, 1.0, 1.0
+ ],
+ sigmas=[
+ 0.026, 0.026, 0.107, 0.079, 0.072, 0.062, 0.107, 0.087, 0.089, 0.079,
+ 0.072, 0.062, 0.107, 0.087, 0.089, 0.025, 0.035, 0.025, 0.035
+ ])
diff --git a/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py b/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py
new file mode 100644
index 0000000..5d01b9a
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/panoptic_hand2d.py
@@ -0,0 +1,143 @@
+dataset_info = dict(
+ dataset_name='panoptic_hand2d',
+ paper_info=dict(
+ author='Simon, Tomas and Joo, Hanbyul and '
+ 'Matthews, Iain and Sheikh, Yaser',
+ title='Hand keypoint detection in single images using '
+ 'multiview bootstrapping',
+        container='Proceedings of the IEEE Conference on '
+ 'Computer Vision and Pattern Recognition',
+ year='2017',
+ homepage='http://domedb.perception.cs.cmu.edu/handdb.html',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger1',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger2',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger3',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger4',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/datasets/posetrack18.py b/modules/rtmpose/configs/_base_/datasets/posetrack18.py
new file mode 100644
index 0000000..18e1891
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/posetrack18.py
@@ -0,0 +1,176 @@
+dataset_info = dict(
+ dataset_name='posetrack18',
+ paper_info=dict(
+ author='Andriluka, Mykhaylo and Iqbal, Umar and '
+ 'Insafutdinov, Eldar and Pishchulin, Leonid and '
+ 'Milan, Anton and Gall, Juergen and Schiele, Bernt',
+        title='PoseTrack: A benchmark for human pose estimation and tracking',
+ container='Proceedings of the IEEE Conference on '
+ 'Computer Vision and Pattern Recognition',
+ year='2018',
+ homepage='https://posetrack.net/users/download.php',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='head_bottom',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 2:
+ dict(
+ name='head_top', id=2, color=[51, 153, 255], type='upper',
+ swap=''),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('nose', 'head_bottom'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'head_top'), id=13, color=[51, 153, 255]),
+ 14:
+        dict(
+            link=('head_bottom', 'left_shoulder'), id=14,
+            color=[51, 153, 255]),
+ 15:
+ dict(
+ link=('head_bottom', 'right_shoulder'),
+ id=15,
+ color=[51, 153, 255])
+ },
+ joint_weights=[
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
+ 1.5
+ ],
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
+ ])
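
The `sigmas` list above supplies the per-keypoint constants for OKS evaluation, while `joint_weights` scales the training loss per keypoint. For reference, a minimal sketch of the standard COCO-style Object Keypoint Similarity computation these sigmas plug into (the function name and signature are illustrative, not part of this diff):

```python
# Minimal sketch (not part of this diff): the standard COCO-style OKS
# computation that per-keypoint `sigmas` feed into at evaluation time.
import numpy as np

def oks(pred, gt, visible, area, sigmas):
    """pred, gt: (K, 2) keypoint arrays; visible: (K,) flags; area: instance area."""
    sigmas = np.asarray(sigmas)
    d2 = np.sum((pred - gt) ** 2, axis=-1)      # squared pixel distances
    k2 = (2.0 * sigmas) ** 2                    # per-keypoint falloff constants
    e = d2 / (2.0 * area * k2 + np.spacing(1))  # scale-normalized error
    mask = visible > 0
    if not mask.any():
        return 0.0
    return float(np.mean(np.exp(-e[mask])))
```

Averaging precision over OKS thresholds (0.50:0.05:0.95) gives the familiar keypoint AP; a large sigma such as the hips (0.107) is more forgiving of localization error than a small one such as the nose (0.026).
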
diff --git a/modules/rtmpose/configs/_base_/datasets/rhd2d.py b/modules/rtmpose/configs/_base_/datasets/rhd2d.py
new file mode 100644
index 0000000..8829b15
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/rhd2d.py
@@ -0,0 +1,151 @@
+dataset_info = dict(
+ dataset_name='rhd2d',
+ paper_info=dict(
+ author='Christian Zimmermann and Thomas Brox',
+ title='Learning to Estimate 3D Hand Pose from Single RGB Images',
+ container='arXiv',
+ year='2017',
+ homepage='https://lmb.informatik.uni-freiburg.de/resources/'
+ 'datasets/RenderedHandposeDataset.en.html',
+ ),
+    # In RHD, keypoints 1-4 are the left thumb ordered from tip to palm,
+    # and the other fingers follow the same tip-to-palm order. Please refer
+    # to `https://lmb.informatik.uni-freiburg.de/resources/datasets/
+    # RenderedHandpose/README` for details of the keypoint definition.
+    # In COCO-WholeBody-Hand, FreiHand, and CMU Panoptic HandDB, the order
+    # is reversed (palm to tip). Pay attention to this if you want to
+    # combine RHD with other hand datasets to train a single model.
+    # Also, note that 'keypoint_info' does not directly affect the order of
+    # the keypoints in the dataset. It is mostly for visualization and for
+    # storing information about flip_pairs.
+ keypoint_info={
+ 0:
+ dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='thumb4', id=1, color=[255, 128, 0], type='', swap=''),
+ 2:
+ dict(name='thumb3', id=2, color=[255, 128, 0], type='', swap=''),
+ 3:
+ dict(name='thumb2', id=3, color=[255, 128, 0], type='', swap=''),
+ 4:
+ dict(name='thumb1', id=4, color=[255, 128, 0], type='', swap=''),
+ 5:
+ dict(
+ name='forefinger4', id=5, color=[255, 153, 255], type='', swap=''),
+ 6:
+ dict(
+ name='forefinger3', id=6, color=[255, 153, 255], type='', swap=''),
+ 7:
+ dict(
+ name='forefinger2', id=7, color=[255, 153, 255], type='', swap=''),
+ 8:
+ dict(
+ name='forefinger1', id=8, color=[255, 153, 255], type='', swap=''),
+ 9:
+ dict(
+ name='middle_finger4',
+ id=9,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 10:
+ dict(
+ name='middle_finger3',
+ id=10,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 11:
+ dict(
+ name='middle_finger2',
+ id=11,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 12:
+ dict(
+ name='middle_finger1',
+ id=12,
+ color=[102, 178, 255],
+ type='',
+ swap=''),
+ 13:
+ dict(
+ name='ring_finger4', id=13, color=[255, 51, 51], type='', swap=''),
+ 14:
+ dict(
+ name='ring_finger3', id=14, color=[255, 51, 51], type='', swap=''),
+ 15:
+ dict(
+ name='ring_finger2', id=15, color=[255, 51, 51], type='', swap=''),
+ 16:
+ dict(
+ name='ring_finger1', id=16, color=[255, 51, 51], type='', swap=''),
+ 17:
+ dict(name='pinky_finger4', id=17, color=[0, 255, 0], type='', swap=''),
+ 18:
+ dict(name='pinky_finger3', id=18, color=[0, 255, 0], type='', swap=''),
+ 19:
+ dict(name='pinky_finger2', id=19, color=[0, 255, 0], type='', swap=''),
+ 20:
+ dict(name='pinky_finger1', id=20, color=[0, 255, 0], type='', swap='')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]),
+ 5:
+ dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]),
+ 6:
+ dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]),
+ 7:
+ dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]),
+ 8:
+ dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]),
+ 9:
+ dict(
+ link=('middle_finger1', 'middle_finger2'),
+ id=9,
+ color=[102, 178, 255]),
+ 10:
+ dict(
+ link=('middle_finger2', 'middle_finger3'),
+ id=10,
+ color=[102, 178, 255]),
+ 11:
+ dict(
+ link=('middle_finger3', 'middle_finger4'),
+ id=11,
+ color=[102, 178, 255]),
+ 12:
+ dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]),
+ 13:
+ dict(
+ link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]),
+ 14:
+ dict(
+ link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]),
+ 15:
+ dict(
+ link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]),
+ 16:
+ dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]),
+ 17:
+ dict(
+ link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]),
+ 18:
+ dict(
+ link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]),
+ 19:
+ dict(
+ link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 21,
+ sigmas=[])
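
The comment at the top of `rhd2d.py` matters in practice: RHD stores each finger tip-to-palm, while COCO-WholeBody-Hand, FreiHand, and CMU Panoptic HandDB store palm-to-tip. If you mix RHD with those datasets, something like the following index remap is needed when converting annotations (the helper name and constant are hypothetical, not part of this diff):

```python
# Hypothetical sketch (not in this diff): remap RHD's tip-to-palm finger
# order to the palm-to-tip order used by COCO-WholeBody-Hand / FreiHand.
import numpy as np

# RHD layout: 0 = wrist, then five fingers of four joints each, with every
# finger stored from tip (first) to palm (last); reverse each 4-joint run.
RHD_TO_PALM_FIRST = [0] + [
    base + j for base in (1, 5, 9, 13, 17) for j in (3, 2, 1, 0)
]

def reorder_rhd_keypoints(kpts):
    """kpts: (21, D) array in RHD order -> same array in palm-to-tip order."""
    return np.asarray(kpts)[RHD_TO_PALM_FIRST]
```

Applied once during annotation conversion, this lets RHD share a model head with palm-to-tip hand datasets.
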
diff --git a/modules/rtmpose/configs/_base_/datasets/shelf.py b/modules/rtmpose/configs/_base_/datasets/shelf.py
new file mode 100644
index 0000000..6a7984b
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/shelf.py
@@ -0,0 +1,151 @@
+dataset_info = dict(
+ dataset_name='shelf',
+ paper_info=dict(
+ author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, '
+ 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan',
+ title='3D Pictorial Structures for Multiple Human Pose Estimation',
+ container='IEEE Computer Society Conference on Computer Vision and '
+ 'Pattern Recognition (CVPR)',
+ year='2014',
+ homepage='http://campar.in.tum.de/Chair/MultiHumanPose',
+ ),
+ keypoint_info={
+ 0:
+ dict(
+ name='right_ankle',
+ id=0,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 1:
+ dict(
+ name='right_knee',
+ id=1,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 2:
+ dict(
+ name='right_hip',
+ id=2,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 3:
+ dict(
+ name='left_hip',
+ id=3,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 4:
+ dict(
+ name='left_knee',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 5:
+ dict(
+ name='left_ankle',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 6:
+ dict(
+ name='right_wrist',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 7:
+ dict(
+ name='right_elbow',
+ id=7,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 8:
+ dict(
+ name='right_shoulder',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 9:
+ dict(
+ name='left_shoulder',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 10:
+ dict(
+ name='left_elbow',
+ id=10,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 11:
+ dict(
+ name='left_wrist',
+ id=11,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 12:
+ dict(
+ name='bottom_head',
+ id=12,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ 13:
+ dict(
+ name='top_head',
+ id=13,
+ color=[51, 153, 255],
+ type='upper',
+ swap=''),
+ },
+ skeleton_info={
+ 0:
+ dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
+ 1:
+ dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
+ 2:
+ dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]),
+ 3:
+ dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]),
+ 4:
+ dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]),
+ 6:
+ dict(
+ link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]),
+ 7:
+ dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]),
+ 8:
+ dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]),
+ 11:
+        dict(
+            link=('right_shoulder', 'bottom_head'), id=11,
+            color=[255, 128, 0]),
+ 12:
+ dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]),
+ 13:
+ dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]),
+ },
+ joint_weights=[
+ 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0
+ ],
+ sigmas=[
+ 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079,
+ 0.072, 0.062, 0.026, 0.026
+ ])
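
Every `keypoint_info` entry declares a `swap` partner so horizontal-flip augmentation can exchange left/right keypoints. A minimal sketch, assuming only the dict layout shown in these configs, of how the swap fields translate into flip index pairs:

```python
# Minimal sketch (not in this diff): derive flip index pairs from the
# `swap` fields of a keypoint_info dict like the ones in these configs.
def build_flip_pairs(keypoint_info):
    name_to_id = {v['name']: k for k, v in keypoint_info.items()}
    pairs = []
    for idx, info in keypoint_info.items():
        swap = info['swap']
        if swap and idx < name_to_id[swap]:  # record each pair once
            pairs.append((idx, name_to_id[swap]))
    return pairs

# For the Shelf layout above this yields (0, 5) for the ankles,
# (1, 4) for the knees, (2, 3) for the hips, and so on.
```

MMPose derives the equivalent `flip_pairs`/`flip_indices` automatically when it parses these files; the sketch only makes the convention concrete.
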
diff --git a/modules/rtmpose/configs/_base_/datasets/ubody2d.py b/modules/rtmpose/configs/_base_/datasets/ubody2d.py
new file mode 100644
index 0000000..8e63299
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/ubody2d.py
@@ -0,0 +1,1153 @@
+dataset_info = dict(
+ dataset_name='ubody2d',
+ paper_info=dict(
+ author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li',
+        title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware '
+ 'Transformer',
+ container='IEEE Computer Society Conference on Computer Vision and '
+ 'Pattern Recognition (CVPR)',
+ year='2023',
+ homepage='https://github.com/IDEA-Research/OSX',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
+ 1:
+ dict(
+ name='left_eye',
+ id=1,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_eye'),
+ 2:
+ dict(
+ name='right_eye',
+ id=2,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_eye'),
+ 3:
+ dict(
+ name='left_ear',
+ id=3,
+ color=[51, 153, 255],
+ type='upper',
+ swap='right_ear'),
+ 4:
+ dict(
+ name='right_ear',
+ id=4,
+ color=[51, 153, 255],
+ type='upper',
+ swap='left_ear'),
+ 5:
+ dict(
+ name='left_shoulder',
+ id=5,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_shoulder'),
+ 6:
+ dict(
+ name='right_shoulder',
+ id=6,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_shoulder'),
+ 7:
+ dict(
+ name='left_elbow',
+ id=7,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_elbow'),
+ 8:
+ dict(
+ name='right_elbow',
+ id=8,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_elbow'),
+ 9:
+ dict(
+ name='left_wrist',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='right_wrist'),
+ 10:
+ dict(
+ name='right_wrist',
+ id=10,
+ color=[255, 128, 0],
+ type='upper',
+ swap='left_wrist'),
+ 11:
+ dict(
+ name='left_hip',
+ id=11,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_hip'),
+ 12:
+ dict(
+ name='right_hip',
+ id=12,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_hip'),
+ 13:
+ dict(
+ name='left_knee',
+ id=13,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_knee'),
+ 14:
+ dict(
+ name='right_knee',
+ id=14,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_knee'),
+ 15:
+ dict(
+ name='left_ankle',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='right_ankle'),
+ 16:
+ dict(
+ name='right_ankle',
+ id=16,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_ankle'),
+ 17:
+ dict(
+ name='left_big_toe',
+ id=17,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_big_toe'),
+ 18:
+ dict(
+ name='left_small_toe',
+ id=18,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_small_toe'),
+ 19:
+ dict(
+ name='left_heel',
+ id=19,
+ color=[255, 128, 0],
+ type='lower',
+ swap='right_heel'),
+ 20:
+ dict(
+ name='right_big_toe',
+ id=20,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_big_toe'),
+ 21:
+ dict(
+ name='right_small_toe',
+ id=21,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_small_toe'),
+ 22:
+ dict(
+ name='right_heel',
+ id=22,
+ color=[255, 128, 0],
+ type='lower',
+ swap='left_heel'),
+ 23:
+ dict(
+ name='face-0',
+ id=23,
+ color=[255, 255, 255],
+ type='',
+ swap='face-16'),
+ 24:
+ dict(
+ name='face-1',
+ id=24,
+ color=[255, 255, 255],
+ type='',
+ swap='face-15'),
+ 25:
+ dict(
+ name='face-2',
+ id=25,
+ color=[255, 255, 255],
+ type='',
+ swap='face-14'),
+ 26:
+ dict(
+ name='face-3',
+ id=26,
+ color=[255, 255, 255],
+ type='',
+ swap='face-13'),
+ 27:
+ dict(
+ name='face-4',
+ id=27,
+ color=[255, 255, 255],
+ type='',
+ swap='face-12'),
+ 28:
+ dict(
+ name='face-5',
+ id=28,
+ color=[255, 255, 255],
+ type='',
+ swap='face-11'),
+ 29:
+ dict(
+ name='face-6',
+ id=29,
+ color=[255, 255, 255],
+ type='',
+ swap='face-10'),
+ 30:
+ dict(
+ name='face-7',
+ id=30,
+ color=[255, 255, 255],
+ type='',
+ swap='face-9'),
+ 31:
+ dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''),
+ 32:
+ dict(
+ name='face-9',
+ id=32,
+ color=[255, 255, 255],
+ type='',
+ swap='face-7'),
+ 33:
+ dict(
+ name='face-10',
+ id=33,
+ color=[255, 255, 255],
+ type='',
+ swap='face-6'),
+ 34:
+ dict(
+ name='face-11',
+ id=34,
+ color=[255, 255, 255],
+ type='',
+ swap='face-5'),
+ 35:
+ dict(
+ name='face-12',
+ id=35,
+ color=[255, 255, 255],
+ type='',
+ swap='face-4'),
+ 36:
+ dict(
+ name='face-13',
+ id=36,
+ color=[255, 255, 255],
+ type='',
+ swap='face-3'),
+ 37:
+ dict(
+ name='face-14',
+ id=37,
+ color=[255, 255, 255],
+ type='',
+ swap='face-2'),
+ 38:
+ dict(
+ name='face-15',
+ id=38,
+ color=[255, 255, 255],
+ type='',
+ swap='face-1'),
+ 39:
+ dict(
+ name='face-16',
+ id=39,
+ color=[255, 255, 255],
+ type='',
+ swap='face-0'),
+ 40:
+ dict(
+ name='face-17',
+ id=40,
+ color=[255, 255, 255],
+ type='',
+ swap='face-26'),
+ 41:
+ dict(
+ name='face-18',
+ id=41,
+ color=[255, 255, 255],
+ type='',
+ swap='face-25'),
+ 42:
+ dict(
+ name='face-19',
+ id=42,
+ color=[255, 255, 255],
+ type='',
+ swap='face-24'),
+ 43:
+ dict(
+ name='face-20',
+ id=43,
+ color=[255, 255, 255],
+ type='',
+ swap='face-23'),
+ 44:
+ dict(
+ name='face-21',
+ id=44,
+ color=[255, 255, 255],
+ type='',
+ swap='face-22'),
+ 45:
+ dict(
+ name='face-22',
+ id=45,
+ color=[255, 255, 255],
+ type='',
+ swap='face-21'),
+ 46:
+ dict(
+ name='face-23',
+ id=46,
+ color=[255, 255, 255],
+ type='',
+ swap='face-20'),
+ 47:
+ dict(
+ name='face-24',
+ id=47,
+ color=[255, 255, 255],
+ type='',
+ swap='face-19'),
+ 48:
+ dict(
+ name='face-25',
+ id=48,
+ color=[255, 255, 255],
+ type='',
+ swap='face-18'),
+ 49:
+ dict(
+ name='face-26',
+ id=49,
+ color=[255, 255, 255],
+ type='',
+ swap='face-17'),
+ 50:
+ dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''),
+ 51:
+ dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''),
+ 52:
+ dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''),
+ 53:
+ dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''),
+ 54:
+ dict(
+ name='face-31',
+ id=54,
+ color=[255, 255, 255],
+ type='',
+ swap='face-35'),
+ 55:
+ dict(
+ name='face-32',
+ id=55,
+ color=[255, 255, 255],
+ type='',
+ swap='face-34'),
+ 56:
+ dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''),
+ 57:
+ dict(
+ name='face-34',
+ id=57,
+ color=[255, 255, 255],
+ type='',
+ swap='face-32'),
+ 58:
+ dict(
+ name='face-35',
+ id=58,
+ color=[255, 255, 255],
+ type='',
+ swap='face-31'),
+ 59:
+ dict(
+ name='face-36',
+ id=59,
+ color=[255, 255, 255],
+ type='',
+ swap='face-45'),
+ 60:
+ dict(
+ name='face-37',
+ id=60,
+ color=[255, 255, 255],
+ type='',
+ swap='face-44'),
+ 61:
+ dict(
+ name='face-38',
+ id=61,
+ color=[255, 255, 255],
+ type='',
+ swap='face-43'),
+ 62:
+ dict(
+ name='face-39',
+ id=62,
+ color=[255, 255, 255],
+ type='',
+ swap='face-42'),
+ 63:
+ dict(
+ name='face-40',
+ id=63,
+ color=[255, 255, 255],
+ type='',
+ swap='face-47'),
+ 64:
+ dict(
+ name='face-41',
+ id=64,
+ color=[255, 255, 255],
+ type='',
+ swap='face-46'),
+ 65:
+ dict(
+ name='face-42',
+ id=65,
+ color=[255, 255, 255],
+ type='',
+ swap='face-39'),
+ 66:
+ dict(
+ name='face-43',
+ id=66,
+ color=[255, 255, 255],
+ type='',
+ swap='face-38'),
+ 67:
+ dict(
+ name='face-44',
+ id=67,
+ color=[255, 255, 255],
+ type='',
+ swap='face-37'),
+ 68:
+ dict(
+ name='face-45',
+ id=68,
+ color=[255, 255, 255],
+ type='',
+ swap='face-36'),
+ 69:
+ dict(
+ name='face-46',
+ id=69,
+ color=[255, 255, 255],
+ type='',
+ swap='face-41'),
+ 70:
+ dict(
+ name='face-47',
+ id=70,
+ color=[255, 255, 255],
+ type='',
+ swap='face-40'),
+ 71:
+ dict(
+ name='face-48',
+ id=71,
+ color=[255, 255, 255],
+ type='',
+ swap='face-54'),
+ 72:
+ dict(
+ name='face-49',
+ id=72,
+ color=[255, 255, 255],
+ type='',
+ swap='face-53'),
+ 73:
+ dict(
+ name='face-50',
+ id=73,
+ color=[255, 255, 255],
+ type='',
+ swap='face-52'),
+ 74:
+ dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''),
+ 75:
+ dict(
+ name='face-52',
+ id=75,
+ color=[255, 255, 255],
+ type='',
+ swap='face-50'),
+ 76:
+ dict(
+ name='face-53',
+ id=76,
+ color=[255, 255, 255],
+ type='',
+ swap='face-49'),
+ 77:
+ dict(
+ name='face-54',
+ id=77,
+ color=[255, 255, 255],
+ type='',
+ swap='face-48'),
+ 78:
+ dict(
+ name='face-55',
+ id=78,
+ color=[255, 255, 255],
+ type='',
+ swap='face-59'),
+ 79:
+ dict(
+ name='face-56',
+ id=79,
+ color=[255, 255, 255],
+ type='',
+ swap='face-58'),
+ 80:
+ dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''),
+ 81:
+ dict(
+ name='face-58',
+ id=81,
+ color=[255, 255, 255],
+ type='',
+ swap='face-56'),
+ 82:
+ dict(
+ name='face-59',
+ id=82,
+ color=[255, 255, 255],
+ type='',
+ swap='face-55'),
+ 83:
+ dict(
+ name='face-60',
+ id=83,
+ color=[255, 255, 255],
+ type='',
+ swap='face-64'),
+ 84:
+ dict(
+ name='face-61',
+ id=84,
+ color=[255, 255, 255],
+ type='',
+ swap='face-63'),
+ 85:
+ dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''),
+ 86:
+ dict(
+ name='face-63',
+ id=86,
+ color=[255, 255, 255],
+ type='',
+ swap='face-61'),
+ 87:
+ dict(
+ name='face-64',
+ id=87,
+ color=[255, 255, 255],
+ type='',
+ swap='face-60'),
+ 88:
+ dict(
+ name='face-65',
+ id=88,
+ color=[255, 255, 255],
+ type='',
+ swap='face-67'),
+ 89:
+ dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''),
+ 90:
+ dict(
+ name='face-67',
+ id=90,
+ color=[255, 255, 255],
+ type='',
+ swap='face-65'),
+ 91:
+ dict(
+ name='left_hand_root',
+ id=91,
+ color=[255, 255, 255],
+ type='',
+ swap='right_hand_root'),
+ 92:
+ dict(
+ name='left_thumb1',
+ id=92,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb1'),
+ 93:
+ dict(
+ name='left_thumb2',
+ id=93,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb2'),
+ 94:
+ dict(
+ name='left_thumb3',
+ id=94,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb3'),
+ 95:
+ dict(
+ name='left_thumb4',
+ id=95,
+ color=[255, 128, 0],
+ type='',
+ swap='right_thumb4'),
+ 96:
+ dict(
+ name='left_forefinger1',
+ id=96,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger1'),
+ 97:
+ dict(
+ name='left_forefinger2',
+ id=97,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger2'),
+ 98:
+ dict(
+ name='left_forefinger3',
+ id=98,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger3'),
+ 99:
+ dict(
+ name='left_forefinger4',
+ id=99,
+ color=[255, 153, 255],
+ type='',
+ swap='right_forefinger4'),
+ 100:
+ dict(
+ name='left_middle_finger1',
+ id=100,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger1'),
+ 101:
+ dict(
+ name='left_middle_finger2',
+ id=101,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger2'),
+ 102:
+ dict(
+ name='left_middle_finger3',
+ id=102,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger3'),
+ 103:
+ dict(
+ name='left_middle_finger4',
+ id=103,
+ color=[102, 178, 255],
+ type='',
+ swap='right_middle_finger4'),
+ 104:
+ dict(
+ name='left_ring_finger1',
+ id=104,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger1'),
+ 105:
+ dict(
+ name='left_ring_finger2',
+ id=105,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger2'),
+ 106:
+ dict(
+ name='left_ring_finger3',
+ id=106,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger3'),
+ 107:
+ dict(
+ name='left_ring_finger4',
+ id=107,
+ color=[255, 51, 51],
+ type='',
+ swap='right_ring_finger4'),
+ 108:
+ dict(
+ name='left_pinky_finger1',
+ id=108,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger1'),
+ 109:
+ dict(
+ name='left_pinky_finger2',
+ id=109,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger2'),
+ 110:
+ dict(
+ name='left_pinky_finger3',
+ id=110,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger3'),
+ 111:
+ dict(
+ name='left_pinky_finger4',
+ id=111,
+ color=[0, 255, 0],
+ type='',
+ swap='right_pinky_finger4'),
+ 112:
+ dict(
+ name='right_hand_root',
+ id=112,
+ color=[255, 255, 255],
+ type='',
+ swap='left_hand_root'),
+ 113:
+ dict(
+ name='right_thumb1',
+ id=113,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb1'),
+ 114:
+ dict(
+ name='right_thumb2',
+ id=114,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb2'),
+ 115:
+ dict(
+ name='right_thumb3',
+ id=115,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb3'),
+ 116:
+ dict(
+ name='right_thumb4',
+ id=116,
+ color=[255, 128, 0],
+ type='',
+ swap='left_thumb4'),
+ 117:
+ dict(
+ name='right_forefinger1',
+ id=117,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger1'),
+ 118:
+ dict(
+ name='right_forefinger2',
+ id=118,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger2'),
+ 119:
+ dict(
+ name='right_forefinger3',
+ id=119,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger3'),
+ 120:
+ dict(
+ name='right_forefinger4',
+ id=120,
+ color=[255, 153, 255],
+ type='',
+ swap='left_forefinger4'),
+ 121:
+ dict(
+ name='right_middle_finger1',
+ id=121,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger1'),
+ 122:
+ dict(
+ name='right_middle_finger2',
+ id=122,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger2'),
+ 123:
+ dict(
+ name='right_middle_finger3',
+ id=123,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger3'),
+ 124:
+ dict(
+ name='right_middle_finger4',
+ id=124,
+ color=[102, 178, 255],
+ type='',
+ swap='left_middle_finger4'),
+ 125:
+ dict(
+ name='right_ring_finger1',
+ id=125,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger1'),
+ 126:
+ dict(
+ name='right_ring_finger2',
+ id=126,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger2'),
+ 127:
+ dict(
+ name='right_ring_finger3',
+ id=127,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger3'),
+ 128:
+ dict(
+ name='right_ring_finger4',
+ id=128,
+ color=[255, 51, 51],
+ type='',
+ swap='left_ring_finger4'),
+ 129:
+ dict(
+ name='right_pinky_finger1',
+ id=129,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger1'),
+ 130:
+ dict(
+ name='right_pinky_finger2',
+ id=130,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger2'),
+ 131:
+ dict(
+ name='right_pinky_finger3',
+ id=131,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger3'),
+ 132:
+ dict(
+ name='right_pinky_finger4',
+ id=132,
+ color=[0, 255, 0],
+ type='',
+ swap='left_pinky_finger4')
+ },
+ skeleton_info={
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]),
+ 20:
+ dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]),
+ 21:
+ dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=23,
+ color=[255, 128, 0]),
+ 24:
+ dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]),
+ 25:
+        dict(
+            link=('left_hand_root', 'left_thumb1'), id=25,
+            color=[255, 128, 0]),
+ 26:
+ dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]),
+ 27:
+ dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]),
+ 28:
+ dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]),
+ 29:
+ dict(
+ link=('left_hand_root', 'left_forefinger1'),
+ id=29,
+ color=[255, 153, 255]),
+ 30:
+ dict(
+ link=('left_forefinger1', 'left_forefinger2'),
+ id=30,
+ color=[255, 153, 255]),
+ 31:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=31,
+ color=[255, 153, 255]),
+ 32:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=32,
+ color=[255, 153, 255]),
+ 33:
+ dict(
+ link=('left_hand_root', 'left_middle_finger1'),
+ id=33,
+ color=[102, 178, 255]),
+ 34:
+ dict(
+ link=('left_middle_finger1', 'left_middle_finger2'),
+ id=34,
+ color=[102, 178, 255]),
+ 35:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=35,
+ color=[102, 178, 255]),
+ 36:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=36,
+ color=[102, 178, 255]),
+ 37:
+ dict(
+ link=('left_hand_root', 'left_ring_finger1'),
+ id=37,
+ color=[255, 51, 51]),
+ 38:
+ dict(
+ link=('left_ring_finger1', 'left_ring_finger2'),
+ id=38,
+ color=[255, 51, 51]),
+ 39:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=39,
+ color=[255, 51, 51]),
+ 40:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=40,
+ color=[255, 51, 51]),
+ 41:
+ dict(
+ link=('left_hand_root', 'left_pinky_finger1'),
+ id=41,
+ color=[0, 255, 0]),
+ 42:
+ dict(
+ link=('left_pinky_finger1', 'left_pinky_finger2'),
+ id=42,
+ color=[0, 255, 0]),
+ 43:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=43,
+ color=[0, 255, 0]),
+ 44:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=44,
+ color=[0, 255, 0]),
+ 45:
+ dict(
+ link=('right_hand_root', 'right_thumb1'),
+ id=45,
+ color=[255, 128, 0]),
+ 46:
+ dict(
+ link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]),
+ 47:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]),
+ 48:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]),
+ 49:
+ dict(
+ link=('right_hand_root', 'right_forefinger1'),
+ id=49,
+ color=[255, 153, 255]),
+ 50:
+ dict(
+ link=('right_forefinger1', 'right_forefinger2'),
+ id=50,
+ color=[255, 153, 255]),
+ 51:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=51,
+ color=[255, 153, 255]),
+ 52:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=52,
+ color=[255, 153, 255]),
+ 53:
+ dict(
+ link=('right_hand_root', 'right_middle_finger1'),
+ id=53,
+ color=[102, 178, 255]),
+ 54:
+ dict(
+ link=('right_middle_finger1', 'right_middle_finger2'),
+ id=54,
+ color=[102, 178, 255]),
+ 55:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=55,
+ color=[102, 178, 255]),
+ 56:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=56,
+ color=[102, 178, 255]),
+ 57:
+ dict(
+ link=('right_hand_root', 'right_ring_finger1'),
+ id=57,
+ color=[255, 51, 51]),
+ 58:
+ dict(
+ link=('right_ring_finger1', 'right_ring_finger2'),
+ id=58,
+ color=[255, 51, 51]),
+ 59:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=59,
+ color=[255, 51, 51]),
+ 60:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=60,
+ color=[255, 51, 51]),
+ 61:
+ dict(
+ link=('right_hand_root', 'right_pinky_finger1'),
+ id=61,
+ color=[0, 255, 0]),
+ 62:
+ dict(
+ link=('right_pinky_finger1', 'right_pinky_finger2'),
+ id=62,
+ color=[0, 255, 0]),
+ 63:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=63,
+ color=[0, 255, 0]),
+ 64:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=64,
+ color=[0, 255, 0])
+ },
+ joint_weights=[1.] * 133,
+ # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/'
+ # 'evaluation/myeval_wholebody.py#L175'
+ sigmas=[
+ 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
+ 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066,
+ 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031,
+ 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045,
+ 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015,
+ 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017,
+ 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010,
+ 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009,
+ 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01,
+ 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035,
+ 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019,
+ 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024,
+ 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02,
+ 0.019, 0.022, 0.031
+ ])
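
For reference, MMPose 1.x does not import these files directly; datasets point at them through a `from_file` metainfo entry that is resolved at load time. A sketch of that lookup, under the assumption that the mmpose 1.x helper below exists with this import path (verify against the installed version):

```python
# Assumed mmpose 1.x API; check mmpose/datasets/datasets/utils.py in your
# installed version before relying on this import path.
from mmpose.datasets.datasets.utils import parse_pose_metainfo

metainfo = parse_pose_metainfo(
    dict(from_file='modules/rtmpose/configs/_base_/datasets/ubody2d.py'))
print(metainfo['num_keypoints'])        # expected: 133
print(metainfo['keypoint_id2name'][0])  # expected: 'nose'
```
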
diff --git a/modules/rtmpose/configs/_base_/datasets/ubody3d.py b/modules/rtmpose/configs/_base_/datasets/ubody3d.py
new file mode 100644
index 0000000..313b7df
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/ubody3d.py
@@ -0,0 +1,958 @@
+dataset_info = dict(
+ dataset_name='ubody3d',
+ paper_info=dict(
+ author='Jing Lin, Ailing Zeng, Haoqian Wang, Lei Zhang, Yu Li',
+ title='One-Stage 3D Whole-Body Mesh Recovery with Component Aware'
+ 'Transformer',
+ container='IEEE Computer Society Conference on Computer Vision and '
+ 'Pattern Recognition (CVPR)',
+ year='2023',
+ homepage='https://github.com/IDEA-Research/OSX',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='Pelvis', id=0, color=[0, 255, 0], type='', swap=''),
+ 1:
+ dict(
+ name='L_Hip', id=1, color=[0, 255, 0], type='lower', swap='R_Hip'),
+ 2:
+ dict(
+ name='R_Hip', id=2, color=[0, 255, 0], type='lower', swap='L_Hip'),
+ 3:
+ dict(
+ name='L_Knee',
+ id=3,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_Knee'),
+ 4:
+ dict(
+ name='R_Knee',
+ id=4,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Knee'),
+ 5:
+ dict(
+ name='L_Ankle',
+ id=5,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_Ankle'),
+ 6:
+ dict(
+ name='R_Ankle',
+ id=6,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Ankle'),
+ 7:
+ dict(name='Neck', id=7, color=[0, 255, 0], type='upper', swap=''),
+ 8:
+ dict(
+ name='L_Shoulder',
+ id=8,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_Shoulder'),
+ 9:
+ dict(
+ name='R_Shoulder',
+ id=9,
+ color=[0, 255, 0],
+ type='upper',
+ swap='L_Shoulder'),
+ 10:
+ dict(
+ name='L_Elbow',
+ id=10,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_Elbow'),
+ 11:
+ dict(
+ name='R_Elbow',
+ id=11,
+ color=[0, 255, 0],
+ type='upper',
+ swap='L_Elbow'),
+ 12:
+ dict(
+ name='L_Wrist',
+ id=12,
+ color=[0, 255, 0],
+ type='upper',
+ swap='R_Wrist'),
+ 13:
+ dict(
+ name='R_Wrist',
+ id=13,
+ color=[0, 255, 0],
+ type='upper',
+ swap='L_Wrist'),
+ 14:
+ dict(
+ name='L_Big_toe',
+ id=14,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_Big_toe'),
+ 15:
+ dict(
+ name='L_Small_toe',
+ id=15,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_Small_toe'),
+ 16:
+ dict(
+ name='L_Heel',
+ id=16,
+ color=[0, 255, 0],
+ type='lower',
+ swap='R_Heel'),
+ 17:
+ dict(
+ name='R_Big_toe',
+ id=17,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Big_toe'),
+ 18:
+ dict(
+ name='R_Small_toe',
+ id=18,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Small_toe'),
+ 19:
+ dict(
+ name='R_Heel',
+ id=19,
+ color=[0, 255, 0],
+ type='lower',
+ swap='L_Heel'),
+ 20:
+ dict(
+ name='L_Ear', id=20, color=[0, 255, 0], type='upper',
+ swap='R_Ear'),
+ 21:
+ dict(
+ name='R_Ear', id=21, color=[0, 255, 0], type='upper',
+ swap='L_Ear'),
+ 22:
+ dict(name='L_Eye', id=22, color=[0, 255, 0], type='', swap='R_Eye'),
+ 23:
+ dict(name='R_Eye', id=23, color=[0, 255, 0], type='', swap='L_Eye'),
+ 24:
+ dict(name='Nose', id=24, color=[0, 255, 0], type='upper', swap=''),
+ 25:
+ dict(
+ name='L_Thumb_1',
+ id=25,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Thumb_1'),
+ 26:
+ dict(
+ name='L_Thumb_2',
+ id=26,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Thumb_2'),
+ 27:
+ dict(
+ name='L_Thumb_3',
+ id=27,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Thumb_3'),
+ 28:
+ dict(
+ name='L_Thumb_4',
+ id=28,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Thumb_4'),
+ 29:
+ dict(
+ name='L_Index_1',
+ id=29,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Index_1'),
+ 30:
+ dict(
+ name='L_Index_2',
+ id=30,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Index_2'),
+ 31:
+ dict(
+ name='L_Index_3',
+ id=31,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Index_3'),
+ 32:
+ dict(
+ name='L_Index_4',
+ id=32,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Index_4'),
+ 33:
+ dict(
+ name='L_Middle_1',
+ id=33,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Middle_1'),
+ 34:
+ dict(
+ name='L_Middle_2',
+ id=34,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Middle_2'),
+ 35:
+ dict(
+ name='L_Middle_3',
+ id=35,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Middle_3'),
+ 36:
+ dict(
+ name='L_Middle_4',
+ id=36,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Middle_4'),
+ 37:
+ dict(
+ name='L_Ring_1',
+ id=37,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Ring_1'),
+ 38:
+ dict(
+ name='L_Ring_2',
+ id=38,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Ring_2'),
+ 39:
+ dict(
+ name='L_Ring_3',
+ id=39,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Ring_3'),
+ 40:
+ dict(
+ name='L_Ring_4',
+ id=40,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Ring_4'),
+ 41:
+ dict(
+ name='L_Pinky_1',
+ id=41,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Pinky_1'),
+ 42:
+ dict(
+ name='L_Pinky_2',
+ id=42,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Pinky_2'),
+ 43:
+ dict(
+ name='L_Pinky_3',
+ id=43,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Pinky_3'),
+ 44:
+ dict(
+ name='L_Pinky_4',
+ id=44,
+ color=[255, 128, 0],
+ type='',
+ swap='R_Pinky_4'),
+ 45:
+ dict(
+ name='R_Thumb_1',
+ id=45,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Thumb_1'),
+ 46:
+ dict(
+ name='R_Thumb_2',
+ id=46,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Thumb_2'),
+ 47:
+ dict(
+ name='R_Thumb_3',
+ id=47,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Thumb_3'),
+ 48:
+ dict(
+ name='R_Thumb_4',
+ id=48,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Thumb_4'),
+ 49:
+ dict(
+ name='R_Index_1',
+ id=49,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Index_1'),
+ 50:
+ dict(
+ name='R_Index_2',
+ id=50,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Index_2'),
+ 51:
+ dict(
+ name='R_Index_3',
+ id=51,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Index_3'),
+ 52:
+ dict(
+ name='R_Index_4',
+ id=52,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Index_4'),
+ 53:
+ dict(
+ name='R_Middle_1',
+ id=53,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Middle_1'),
+ 54:
+ dict(
+ name='R_Middle_2',
+ id=54,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Middle_2'),
+ 55:
+ dict(
+ name='R_Middle_3',
+ id=55,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Middle_3'),
+ 56:
+ dict(
+ name='R_Middle_4',
+ id=56,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Middle_4'),
+ 57:
+ dict(
+ name='R_Ring_1',
+ id=57,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Ring_1'),
+ 58:
+ dict(
+ name='R_Ring_2',
+ id=58,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Ring_2'),
+ 59:
+ dict(
+ name='R_Ring_3',
+ id=59,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Ring_3'),
+ 60:
+ dict(
+ name='R_Ring_4',
+ id=60,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Ring_4'),
+ 61:
+ dict(
+ name='R_Pinky_1',
+ id=61,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Pinky_1'),
+ 62:
+ dict(
+ name='R_Pinky_2',
+ id=62,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Pinky_2'),
+ 63:
+ dict(
+ name='R_Pinky_3',
+ id=63,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Pinky_3'),
+ 64:
+ dict(
+ name='R_Pinky_4',
+ id=64,
+ color=[255, 128, 0],
+ type='',
+ swap='L_Pinky_4'),
+ 65:
+ dict(name='Face_1', id=65, color=[255, 255, 255], type='', swap=''),
+ 66:
+ dict(name='Face_2', id=66, color=[255, 255, 255], type='', swap=''),
+ 67:
+ dict(
+ name='Face_3',
+ id=67,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_4'),
+ 68:
+ dict(
+ name='Face_4',
+ id=68,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_3'),
+ 69:
+ dict(
+ name='Face_5',
+ id=69,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_14'),
+ 70:
+ dict(
+ name='Face_6',
+ id=70,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_13'),
+ 71:
+ dict(
+ name='Face_7',
+ id=71,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_12'),
+ 72:
+ dict(
+ name='Face_8',
+ id=72,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_11'),
+ 73:
+ dict(
+ name='Face_9',
+ id=73,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_10'),
+ 74:
+ dict(
+ name='Face_10',
+ id=74,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_9'),
+ 75:
+ dict(
+ name='Face_11',
+ id=75,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_8'),
+ 76:
+ dict(
+ name='Face_12',
+ id=76,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_7'),
+ 77:
+ dict(
+ name='Face_13',
+ id=77,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_6'),
+ 78:
+ dict(
+ name='Face_14',
+ id=78,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_5'),
+ 79:
+ dict(name='Face_15', id=79, color=[255, 255, 255], type='', swap=''),
+ 80:
+ dict(name='Face_16', id=80, color=[255, 255, 255], type='', swap=''),
+ 81:
+ dict(name='Face_17', id=81, color=[255, 255, 255], type='', swap=''),
+ 82:
+ dict(name='Face_18', id=82, color=[255, 255, 255], type='', swap=''),
+ 83:
+ dict(
+ name='Face_19',
+ id=83,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_23'),
+ 84:
+ dict(
+ name='Face_20',
+ id=84,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_22'),
+ 85:
+ dict(name='Face_21', id=85, color=[255, 255, 255], type='', swap=''),
+ 86:
+ dict(
+ name='Face_22',
+ id=86,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_20'),
+ 87:
+ dict(
+ name='Face_23',
+ id=87,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_19'),
+ 88:
+ dict(
+ name='Face_24',
+ id=88,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_33'),
+ 89:
+ dict(
+ name='Face_25',
+ id=89,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_32'),
+ 90:
+ dict(
+ name='Face_26',
+ id=90,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_31'),
+ 91:
+ dict(
+ name='Face_27',
+ id=91,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_30'),
+ 92:
+ dict(
+ name='Face_28',
+ id=92,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_35'),
+ 93:
+ dict(
+ name='Face_29',
+ id=93,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_34'),
+ 94:
+ dict(
+ name='Face_30',
+ id=94,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_27'),
+ 95:
+ dict(
+ name='Face_31',
+ id=95,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_26'),
+ 96:
+ dict(
+ name='Face_32',
+ id=96,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_25'),
+ 97:
+ dict(
+ name='Face_33',
+ id=97,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_24'),
+ 98:
+ dict(
+ name='Face_34',
+ id=98,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_29'),
+ 99:
+ dict(
+ name='Face_35',
+ id=99,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_28'),
+ 100:
+ dict(
+ name='Face_36',
+ id=100,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_42'),
+ 101:
+ dict(
+ name='Face_37',
+ id=101,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_41'),
+ 102:
+ dict(
+ name='Face_38',
+ id=102,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_40'),
+ 103:
+ dict(name='Face_39', id=103, color=[255, 255, 255], type='', swap=''),
+ 104:
+ dict(
+ name='Face_40',
+ id=104,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_38'),
+ 105:
+ dict(
+ name='Face_41',
+ id=105,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_37'),
+ 106:
+ dict(
+ name='Face_42',
+ id=106,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_36'),
+ 107:
+ dict(
+ name='Face_43',
+ id=107,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_47'),
+ 108:
+ dict(
+ name='Face_44',
+ id=108,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_46'),
+ 109:
+ dict(name='Face_45', id=109, color=[255, 255, 255], type='', swap=''),
+ 110:
+ dict(
+ name='Face_46',
+ id=110,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_44'),
+ 111:
+ dict(
+ name='Face_47',
+ id=111,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_43'),
+ 112:
+ dict(
+ name='Face_48',
+ id=112,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_52'),
+ 113:
+ dict(
+ name='Face_49',
+ id=113,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_51'),
+ 114:
+ dict(name='Face_50', id=114, color=[255, 255, 255], type='', swap=''),
+ 115:
+ dict(
+ name='Face_51',
+ id=115,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_49'),
+ 116:
+ dict(
+ name='Face_52',
+ id=116,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_48'),
+ 117:
+ dict(
+ name='Face_53',
+ id=117,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_55'),
+ 118:
+ dict(name='Face_54', id=118, color=[255, 255, 255], type='', swap=''),
+ 119:
+ dict(
+ name='Face_55',
+ id=119,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_53'),
+ 120:
+ dict(
+ name='Face_56',
+ id=120,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_72'),
+ 121:
+ dict(
+ name='Face_57',
+ id=121,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_71'),
+ 122:
+ dict(
+ name='Face_58',
+ id=122,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_70'),
+ 123:
+ dict(
+ name='Face_59',
+ id=123,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_69'),
+ 124:
+ dict(
+ name='Face_60',
+ id=124,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_68'),
+ 125:
+ dict(
+ name='Face_61',
+ id=125,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_67'),
+ 126:
+ dict(
+ name='Face_62',
+ id=126,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_66'),
+ 127:
+ dict(
+ name='Face_63',
+ id=127,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_65'),
+ 128:
+ dict(name='Face_64', id=128, color=[255, 255, 255], type='', swap=''),
+ 129:
+ dict(
+ name='Face_65',
+ id=129,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_63'),
+ 130:
+ dict(
+ name='Face_66',
+ id=130,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_62'),
+ 131:
+ dict(
+ name='Face_67',
+ id=131,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_61'),
+ 132:
+ dict(
+ name='Face_68',
+ id=132,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_60'),
+ 133:
+ dict(
+ name='Face_69',
+ id=133,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_59'),
+ 134:
+ dict(
+ name='Face_70',
+ id=134,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_58'),
+ 135:
+ dict(
+ name='Face_71',
+ id=135,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_57'),
+ 136:
+ dict(
+ name='Face_72',
+ id=136,
+ color=[255, 255, 255],
+ type='',
+ swap='Face_56'),
+ },
+ skeleton_info={
+ 0: dict(link=('L_Ankle', 'L_Knee'), id=0, color=[0, 255, 0]),
+ 1: dict(link=('L_Knee', 'L_Hip'), id=1, color=[0, 255, 0]),
+ 2: dict(link=('R_Ankle', 'R_Knee'), id=2, color=[0, 255, 0]),
+ 3: dict(link=('R_Knee', 'R_Hip'), id=3, color=[0, 255, 0]),
+ 4: dict(link=('L_Hip', 'R_Hip'), id=4, color=[0, 255, 0]),
+ 5: dict(link=('L_Shoulder', 'L_Hip'), id=5, color=[0, 255, 0]),
+ 6: dict(link=('R_Shoulder', 'R_Hip'), id=6, color=[0, 255, 0]),
+ 7: dict(link=('L_Shoulder', 'R_Shoulder'), id=7, color=[0, 255, 0]),
+ 8: dict(link=('L_Shoulder', 'L_Elbow'), id=8, color=[0, 255, 0]),
+ 9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[0, 255, 0]),
+ 10: dict(link=('L_Elbow', 'L_Wrist'), id=10, color=[0, 255, 0]),
+ 11: dict(link=('R_Elbow', 'R_Wrist'), id=11, color=[255, 128, 0]),
+ 12: dict(link=('L_Eye', 'R_Eye'), id=12, color=[255, 128, 0]),
+ 13: dict(link=('Nose', 'L_Eye'), id=13, color=[255, 128, 0]),
+ 14: dict(link=('Nose', 'R_Eye'), id=14, color=[255, 128, 0]),
+ 15: dict(link=('L_Eye', 'L_Ear'), id=15, color=[255, 128, 0]),
+ 16: dict(link=('R_Eye', 'R_Ear'), id=16, color=[255, 128, 0]),
+ 17: dict(link=('L_Ear', 'L_Shoulder'), id=17, color=[255, 128, 0]),
+ 18: dict(link=('R_Ear', 'R_Shoulder'), id=18, color=[255, 128, 0]),
+ 19: dict(link=('L_Ankle', 'L_Big_toe'), id=19, color=[255, 128, 0]),
+ 20: dict(link=('L_Ankle', 'L_Small_toe'), id=20, color=[255, 128, 0]),
+ 21: dict(link=('L_Ankle', 'L_Heel'), id=21, color=[255, 128, 0]),
+ 22: dict(link=('R_Ankle', 'R_Big_toe'), id=22, color=[255, 128, 0]),
+ 23: dict(link=('R_Ankle', 'R_Small_toe'), id=23, color=[255, 128, 0]),
+ 24: dict(link=('R_Ankle', 'R_Heel'), id=24, color=[255, 128, 0]),
+ 25: dict(link=('L_Wrist', 'L_Thumb_1'), id=25, color=[255, 128, 0]),
+ 26: dict(link=('L_Thumb_1', 'L_Thumb_2'), id=26, color=[255, 128, 0]),
+ 27: dict(link=('L_Thumb_2', 'L_Thumb_3'), id=27, color=[255, 128, 0]),
+ 28: dict(link=('L_Thumb_3', 'L_Thumb_4'), id=28, color=[255, 128, 0]),
+ 29: dict(link=('L_Wrist', 'L_Index_1'), id=29, color=[255, 128, 0]),
+ 30: dict(link=('L_Index_1', 'L_Index_2'), id=30, color=[255, 128, 0]),
+ 31:
+ dict(link=('L_Index_2', 'L_Index_3'), id=31, color=[255, 255, 255]),
+ 32:
+ dict(link=('L_Index_3', 'L_Index_4'), id=32, color=[255, 255, 255]),
+ 33: dict(link=('L_Wrist', 'L_Middle_1'), id=33, color=[255, 255, 255]),
+ 34:
+ dict(link=('L_Middle_1', 'L_Middle_2'), id=34, color=[255, 255, 255]),
+ 35:
+ dict(link=('L_Middle_2', 'L_Middle_3'), id=35, color=[255, 255, 255]),
+ 36:
+ dict(link=('L_Middle_3', 'L_Middle_4'), id=36, color=[255, 255, 255]),
+ 37: dict(link=('L_Wrist', 'L_Ring_1'), id=37, color=[255, 255, 255]),
+ 38: dict(link=('L_Ring_1', 'L_Ring_2'), id=38, color=[255, 255, 255]),
+ 39: dict(link=('L_Ring_2', 'L_Ring_3'), id=39, color=[255, 255, 255]),
+ 40: dict(link=('L_Ring_3', 'L_Ring_4'), id=40, color=[255, 255, 255]),
+ 41: dict(link=('L_Wrist', 'L_Pinky_1'), id=41, color=[255, 255, 255]),
+ 42:
+ dict(link=('L_Pinky_1', 'L_Pinky_2'), id=42, color=[255, 255, 255]),
+ 43:
+ dict(link=('L_Pinky_2', 'L_Pinky_3'), id=43, color=[255, 255, 255]),
+ 44:
+ dict(link=('L_Pinky_3', 'L_Pinky_4'), id=44, color=[255, 255, 255]),
+ 45: dict(link=('R_Wrist', 'R_Thumb_1'), id=45, color=[255, 255, 255]),
+ 46:
+ dict(link=('R_Thumb_1', 'R_Thumb_2'), id=46, color=[255, 255, 255]),
+ 47:
+ dict(link=('R_Thumb_2', 'R_Thumb_3'), id=47, color=[255, 255, 255]),
+ 48:
+ dict(link=('R_Thumb_3', 'R_Thumb_4'), id=48, color=[255, 255, 255]),
+ 49: dict(link=('R_Wrist', 'R_Index_1'), id=49, color=[255, 255, 255]),
+ 50:
+ dict(link=('R_Index_1', 'R_Index_2'), id=50, color=[255, 255, 255]),
+ 51:
+ dict(link=('R_Index_2', 'R_Index_3'), id=51, color=[255, 255, 255]),
+ 52:
+ dict(link=('R_Index_3', 'R_Index_4'), id=52, color=[255, 255, 255]),
+ 53: dict(link=('R_Wrist', 'R_Middle_1'), id=53, color=[255, 255, 255]),
+ 54:
+ dict(link=('R_Middle_1', 'R_Middle_2'), id=54, color=[255, 255, 255]),
+ 55:
+ dict(link=('R_Middle_2', 'R_Middle_3'), id=55, color=[255, 255, 255]),
+ 56:
+ dict(link=('R_Middle_3', 'R_Middle_4'), id=56, color=[255, 255, 255]),
+ 57: dict(link=('R_Wrist', 'R_Pinky_1'), id=57, color=[255, 255, 255]),
+ 58:
+ dict(link=('R_Pinky_1', 'R_Pinky_2'), id=58, color=[255, 255, 255]),
+ 59:
+ dict(link=('R_Pinky_2', 'R_Pinky_3'), id=59, color=[255, 255, 255]),
+ 60:
+ dict(link=('R_Pinky_3', 'R_Pinky_4'), id=60, color=[255, 255, 255]),
+ },
+ joint_weights=[1.] * 137,
+ sigmas=[])
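
Configs of this size (ubody3d declares 137 keypoints, ids 0 through 136) are easy to desynchronize when edited by hand. A hypothetical sanity check, assuming only the `dataset_info` layout used throughout these files, that catches mismatched ids, wrong weight lengths, and dangling `swap` or `link` names:

```python
# Hypothetical sanity check (not in this diff): verify a dataset_info dict
# is internally consistent before training with it.
def check_dataset_info(info):
    kpts = info['keypoint_info']
    names = {v['name'] for v in kpts.values()}
    assert all(k == v['id'] for k, v in kpts.items()), 'key/id mismatch'
    assert len(info['joint_weights']) == len(kpts), 'joint_weights length'
    assert not info['sigmas'] or len(info['sigmas']) == len(kpts), \
        'sigmas length'
    for v in kpts.values():
        assert v['swap'] == '' or v['swap'] in names, f"bad swap: {v['swap']}"
    for link in info['skeleton_info'].values():
        assert set(link['link']) <= names, f"bad link: {link['link']}"
```
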
diff --git a/modules/rtmpose/configs/_base_/datasets/wflw.py b/modules/rtmpose/configs/_base_/datasets/wflw.py
new file mode 100644
index 0000000..c31750b
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/wflw.py
@@ -0,0 +1,192 @@
+dataset_info = dict(
+ dataset_name='wflw',
+ paper_info=dict(
+ author='Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, '
+ 'Quan and Cai, Yici and Zhou, Qiang',
+ title='Look at boundary: A boundary-aware face alignment algorithm',
+ container='Proceedings of the IEEE conference on computer '
+ 'vision and pattern recognition',
+ year='2018',
+ homepage='https://wywu.github.io/projects/LAB/WFLW.html',
+ ),
+ keypoint_info={
+ 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-32'),
+ 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-31'),
+ 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-30'),
+ 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-29'),
+ 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-28'),
+ 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-27'),
+ 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-26'),
+ 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-25'),
+ 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-24'),
+ 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-23'),
+ 10:
+ dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-22'),
+ 11:
+ dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-21'),
+ 12:
+ dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-20'),
+ 13:
+ dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-19'),
+ 14:
+ dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-18'),
+ 15:
+ dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'),
+ 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''),
+ 17:
+ dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'),
+ 18:
+ dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-14'),
+ 19:
+ dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-13'),
+ 20:
+ dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-12'),
+ 21:
+ dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-11'),
+ 22:
+ dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-10'),
+ 23:
+ dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-9'),
+ 24:
+ dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-8'),
+ 25:
+ dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-7'),
+ 26:
+ dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-6'),
+ 27:
+ dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap='kpt-5'),
+ 28:
+ dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='kpt-4'),
+ 29:
+ dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap='kpt-3'),
+ 30:
+ dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap='kpt-2'),
+ 31:
+ dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-1'),
+ 32:
+ dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-0'),
+ 33:
+ dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap='kpt-46'),
+ 34:
+ dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-45'),
+ 35:
+ dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-44'),
+ 36:
+ dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-43'),
+ 37: dict(
+ name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-42'),
+ 38: dict(
+ name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-50'),
+ 39: dict(
+ name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-49'),
+ 40: dict(
+ name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-48'),
+ 41: dict(
+ name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-47'),
+ 42: dict(
+ name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-37'),
+ 43: dict(
+ name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-36'),
+ 44: dict(
+ name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-35'),
+ 45: dict(
+ name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-34'),
+ 46: dict(
+ name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-33'),
+ 47: dict(
+ name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-41'),
+ 48: dict(
+ name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-40'),
+ 49: dict(
+ name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-39'),
+ 50: dict(
+ name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-38'),
+ 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''),
+ 52: dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''),
+ 53: dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''),
+ 54: dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''),
+ 55: dict(
+ name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'),
+ 56: dict(
+ name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'),
+ 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''),
+ 58: dict(
+ name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'),
+ 59: dict(
+ name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'),
+ 60: dict(
+ name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-72'),
+ 61: dict(
+ name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-71'),
+ 62: dict(
+ name='kpt-62', id=62, color=[255, 0, 0], type='', swap='kpt-70'),
+ 63: dict(
+ name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-69'),
+ 64: dict(
+ name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-68'),
+ 65: dict(
+ name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-75'),
+ 66: dict(
+ name='kpt-66', id=66, color=[255, 0, 0], type='', swap='kpt-74'),
+ 67: dict(
+ name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-73'),
+ 68: dict(
+ name='kpt-68', id=68, color=[255, 0, 0], type='', swap='kpt-64'),
+ 69: dict(
+ name='kpt-69', id=69, color=[255, 0, 0], type='', swap='kpt-63'),
+ 70: dict(
+ name='kpt-70', id=70, color=[255, 0, 0], type='', swap='kpt-62'),
+ 71: dict(
+ name='kpt-71', id=71, color=[255, 0, 0], type='', swap='kpt-61'),
+ 72: dict(
+ name='kpt-72', id=72, color=[255, 0, 0], type='', swap='kpt-60'),
+ 73: dict(
+ name='kpt-73', id=73, color=[255, 0, 0], type='', swap='kpt-67'),
+ 74: dict(
+ name='kpt-74', id=74, color=[255, 0, 0], type='', swap='kpt-66'),
+ 75: dict(
+ name='kpt-75', id=75, color=[255, 0, 0], type='', swap='kpt-65'),
+ 76: dict(
+ name='kpt-76', id=76, color=[255, 0, 0], type='', swap='kpt-82'),
+ 77: dict(
+ name='kpt-77', id=77, color=[255, 0, 0], type='', swap='kpt-81'),
+ 78: dict(
+ name='kpt-78', id=78, color=[255, 0, 0], type='', swap='kpt-80'),
+ 79: dict(name='kpt-79', id=79, color=[255, 0, 0], type='', swap=''),
+ 80: dict(
+ name='kpt-80', id=80, color=[255, 0, 0], type='', swap='kpt-78'),
+ 81: dict(
+ name='kpt-81', id=81, color=[255, 0, 0], type='', swap='kpt-77'),
+ 82: dict(
+ name='kpt-82', id=82, color=[255, 0, 0], type='', swap='kpt-76'),
+ 83: dict(
+ name='kpt-83', id=83, color=[255, 0, 0], type='', swap='kpt-87'),
+ 84: dict(
+ name='kpt-84', id=84, color=[255, 0, 0], type='', swap='kpt-86'),
+ 85: dict(name='kpt-85', id=85, color=[255, 0, 0], type='', swap=''),
+ 86: dict(
+ name='kpt-86', id=86, color=[255, 0, 0], type='', swap='kpt-84'),
+ 87: dict(
+ name='kpt-87', id=87, color=[255, 0, 0], type='', swap='kpt-83'),
+ 88: dict(
+ name='kpt-88', id=88, color=[255, 0, 0], type='', swap='kpt-92'),
+ 89: dict(
+ name='kpt-89', id=89, color=[255, 0, 0], type='', swap='kpt-91'),
+ 90: dict(name='kpt-90', id=90, color=[255, 0, 0], type='', swap=''),
+ 91: dict(
+ name='kpt-91', id=91, color=[255, 0, 0], type='', swap='kpt-89'),
+ 92: dict(
+ name='kpt-92', id=92, color=[255, 0, 0], type='', swap='kpt-88'),
+ 93: dict(
+ name='kpt-93', id=93, color=[255, 0, 0], type='', swap='kpt-95'),
+ 94: dict(name='kpt-94', id=94, color=[255, 0, 0], type='', swap=''),
+ 95: dict(
+ name='kpt-95', id=95, color=[255, 0, 0], type='', swap='kpt-93'),
+ 96: dict(
+ name='kpt-96', id=96, color=[255, 0, 0], type='', swap='kpt-97'),
+ 97: dict(
+ name='kpt-97', id=97, color=[255, 0, 0], type='', swap='kpt-96')
+ },
+ skeleton_info={},
+ joint_weights=[1.] * 98,
+ sigmas=[])
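
WFLW's empty `skeleton_info` is intentional: face landmarks are rendered as points only, with no limb links. A minimal sketch of drawing such point-only keypoints with OpenCV, reusing the colors declared above (the array shapes and the RGB color-order assumption are mine, not from this diff):

```python
# Minimal sketch (not in this diff): render point-only keypoints such as
# WFLW's, using the per-keypoint colors from a dataset_info dict.
import cv2

def draw_keypoints(img, kpts, keypoint_info, radius=2):
    """img: BGR image; kpts: (K, 2) array of (x, y) pixel coordinates."""
    for idx, (x, y) in enumerate(kpts):
        b, g, r = keypoint_info[idx]['color'][::-1]  # assuming config colors are RGB
        cv2.circle(img, (int(x), int(y)), radius, (int(b), int(g), int(r)), -1)
    return img
```
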
diff --git a/modules/rtmpose/configs/_base_/datasets/zebra.py b/modules/rtmpose/configs/_base_/datasets/zebra.py
new file mode 100644
index 0000000..bc4f9ec
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/datasets/zebra.py
@@ -0,0 +1,64 @@
+dataset_info = dict(
+ dataset_name='zebra',
+ paper_info=dict(
+ author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and '
+ 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and '
+ 'Couzin, Iain D',
+ title='DeepPoseKit, a software toolkit for fast and robust '
+ 'animal pose estimation using deep learning',
+ container='Elife',
+ year='2019',
+ homepage='https://github.com/jgraving/DeepPoseKit-Data',
+ ),
+ keypoint_info={
+ 0:
+ dict(name='snout', id=0, color=[255, 255, 255], type='', swap=''),
+ 1:
+ dict(name='head', id=1, color=[255, 255, 255], type='', swap=''),
+ 2:
+ dict(name='neck', id=2, color=[255, 255, 255], type='', swap=''),
+ 3:
+ dict(
+ name='forelegL1',
+ id=3,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegR1'),
+ 4:
+ dict(
+ name='forelegR1',
+ id=4,
+ color=[255, 255, 255],
+ type='',
+ swap='forelegL1'),
+ 5:
+ dict(
+ name='hindlegL1',
+ id=5,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegR1'),
+ 6:
+ dict(
+ name='hindlegR1',
+ id=6,
+ color=[255, 255, 255],
+ type='',
+ swap='hindlegL1'),
+ 7:
+ dict(name='tailbase', id=7, color=[255, 255, 255], type='', swap=''),
+ 8:
+ dict(name='tailtip', id=8, color=[255, 255, 255], type='', swap='')
+ },
+ skeleton_info={
+ 0: dict(link=('head', 'snout'), id=0, color=[255, 255, 255]),
+ 1: dict(link=('neck', 'head'), id=1, color=[255, 255, 255]),
+ 2: dict(link=('forelegL1', 'neck'), id=2, color=[255, 255, 255]),
+ 3: dict(link=('forelegR1', 'neck'), id=3, color=[255, 255, 255]),
+ 4: dict(link=('hindlegL1', 'tailbase'), id=4, color=[255, 255, 255]),
+ 5: dict(link=('hindlegR1', 'tailbase'), id=5, color=[255, 255, 255]),
+ 6: dict(link=('tailbase', 'neck'), id=6, color=[255, 255, 255]),
+ 7: dict(link=('tailtip', 'tailbase'), id=7, color=[255, 255, 255])
+ },
+ joint_weights=[1.] * 9,
+ sigmas=[])
diff --git a/modules/rtmpose/configs/_base_/default_runtime.py b/modules/rtmpose/configs/_base_/default_runtime.py
new file mode 100644
index 0000000..c83ae93
--- /dev/null
+++ b/modules/rtmpose/configs/_base_/default_runtime.py
@@ -0,0 +1,54 @@
+default_scope = 'mmpose'
+
+# hooks
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=10),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='PoseVisualizationHook', enable=False),
+ badcase=dict(
+ type='BadCaseAnalysisHook',
+ enable=False,
+ out_dir='badcase',
+ metric_type='loss',
+ badcase_thr=5))
+
+# custom hooks
+custom_hooks = [
+ # Synchronize model buffers such as running_mean and running_var in BN
+ # at the end of each epoch
+ dict(type='SyncBuffersHook')
+]
+
+# multi-processing backend
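+# ('fork' keeps worker start-up cheap on Linux; opencv_num_threads=0
+# stops OpenCV's own thread pool competing with dataloader workers)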
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'),
+)
+
+# visualizer
+vis_backends = [
+ dict(type='LocalVisBackend'),
+ # dict(type='TensorboardVisBackend'),
+ # dict(type='WandbVisBackend'),
+]
+visualizer = dict(
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# logger
+log_processor = dict(
+ type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
+log_level = 'INFO'
+load_from = None
+resume = False
+
+# file I/O backend
+backend_args = dict(backend='local')
+
+# training/validation/testing progress
+train_cfg = dict(by_epoch=True)
+val_cfg = dict()
+test_cfg = dict()
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/README.md b/modules/rtmpose/configs/animal_2d_keypoint/README.md
new file mode 100644
index 0000000..1ee3b40
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/README.md
@@ -0,0 +1,20 @@
+# 2D Animal Keypoint Detection
+
+2D animal keypoint detection (animal pose estimation) aims to detect the keypoints of different species, including rats,
+dogs, macaques, and cheetahs. It provides detailed behavioral analysis for neuroscience, medical, and ecological applications.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_animal_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [DEMO](/demo/docs/en/2d_animal_demo.md) to generate fancy demos.
+
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md
new file mode 100644
index 0000000..b722d40
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/README.md
@@ -0,0 +1,16 @@
+# RTMPose
+
+Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet applying these models in industry is still hampered by heavy model parameters and high latency.
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
+Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
+To further evaluate RTMPose's capability in critical real-time applications, we also report its performance after deployment on mobile devices.
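+
+Under MMPose 1.x, running one of these models comes down to a few API calls. Below is a minimal sketch, assuming an AP-10K config from this directory and its released checkpoint; the paths, device, and demo image are placeholders, and the bounding boxes would normally come from an object detector:
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# Placeholder config/checkpoint paths -- substitute any RTMPose pair.
+pose_estimator = init_model(
+    'rtmpose-m_8xb64-210e_ap10k-256x256.py',
+    'rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth',
+    device='cpu')
+
+# With bboxes=None the whole image is treated as a single instance;
+# pass an (N, 4) array of xyxy detections for multi-animal frames.
+results = inference_topdown(pose_estimator, 'demo.jpg', bboxes=None)
+print(results[0].pred_instances.keypoints.shape)  # (1, num_keypoints, 2)
+```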
+
+## Results and Models
+
+### AP-10K Dataset
+
+Results on AP-10K validation set
+
+| Model | Input Size | AP | Details and Download |
+| :-------: | :--------: | :---: | :------------------------------------------: |
+| RTMPose-m | 256x256    | 0.722 | [rtmpose_ap10k.md](./ap10k/rtmpose_ap10k.md) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..576b71f
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,245 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
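+# (the config name implies 8 GPUs x 64 images, i.e. an effective batch
+# of 512, so here base_lr is used unchanged)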
+
+# codec settings
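+# SimCC casts keypoint localization as two 1-D classifications over the
+# x- and y-axes, discretized at simcc_split_ratio x the input resolution
+# (2.0 * 256 = 512 bins per axis here)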
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
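+        # CSPNeXt's P5 stage output has stride 32, hence input_size // 32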
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/',
+# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/'
+# }))
+
+# pipelines
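+# stage 1 uses the stronger augmentations (wider scale/rotation range,
+# always-on CoarseDropout); train_pipeline_stage2 below relaxes them for
+# the final epochs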
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
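+    # switch to the milder stage-2 augmentations for the last
+    # stage2_num_epochs (30) epochs of training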
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md
new file mode 100644
index 0000000..e72cefb
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md
@@ -0,0 +1,25 @@
+AP-10K (NeurIPS'2021)
+
+```bibtex
+@misc{yu2021ap10k,
+ title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
+ author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
+ year={2021},
+ eprint={2108.12617},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+Results on AP-10K validation set
+
+| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log |
+| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: |
+| [rtmpose-m](/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.722 | 0.939 | 0.788 | 0.569 | 0.728 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml
new file mode 100644
index 0000000..1f12576
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py
+ In Collection: RTMPose
+ Alias: animal
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: AP-10K
+ Name: rtmpose-m_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.722
+ AP@0.5: 0.939
+ AP@0.75: 0.788
+ AP (L): 0.728
+ AP (M): 0.569
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..bf13310
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,68 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the detected bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator produces heatmaps that represent the
+likelihood of each location being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
+

+
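+The decoding step is easy to picture in a few lines of NumPy. The sketch below is a simplification (the function name and argument layout are illustrative, and MMPose's real decoders add sub-pixel refinement such as DARK/UDP): take the argmax of each heatmap, then map it back through the crop-and-resize transform that produced the input patch.
+
+```python
+import numpy as np
+
+def decode_heatmaps(heatmaps, bbox_topleft, bbox_size):
+    """Recover image-space keypoints from (K, H, W) heatmaps.
+
+    bbox_topleft and bbox_size are (x, y) pairs describing the crop
+    that the heatmaps were predicted for.
+    """
+    K, H, W = heatmaps.shape
+    flat = heatmaps.reshape(K, -1)
+    idx = flat.argmax(axis=1)   # peak location per keypoint
+    scores = flat.max(axis=1)   # peak value as confidence
+    xy = np.stack([idx % W, idx // W], axis=1).astype(np.float32)
+    # heatmap pixel -> original image pixel
+    keypoints = xy / np.array([W, H]) * np.asarray(bbox_size) + bbox_topleft
+    return keypoints, scores
+```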
+
+## Results and Models
+
+### Animal-Pose Dataset
+
+Results on AnimalPose validation set (1117 instances)
+
+| Model | Input Size | AP | AR | Details and Download |
+| :--------: | :--------: | :---: | :---: | :-------------------------------------------------------: |
+| HRNet-w32 | 256x256 | 0.740 | 0.780 | [hrnet_animalpose.md](./animalpose/hrnet_animalpose.md) |
+| HRNet-w48 | 256x256 | 0.738 | 0.778 | [hrnet_animalpose.md](./animalpose/hrnet_animalpose.md) |
+| ResNet-152 | 256x256 | 0.704 | 0.748 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) |
+| ResNet-101 | 256x256 | 0.696 | 0.736 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) |
+| ResNet-50 | 256x256 | 0.691 | 0.736 | [resnet_animalpose.md](./animalpose/resnet_animalpose.md) |
+
+### AP-10K Dataset
+
+Results on AP-10K validation set
+
+| Model | Input Size | AP | Details and Download |
+| :--------: | :--------: | :---: | :--------------------------------------------------: |
+| HRNet-w48 | 256x256 | 0.728 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) |
+| HRNet-w32 | 256x256 | 0.722 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) |
+| ResNet-101 | 256x256 | 0.681 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) |
+| ResNet-50 | 256x256 | 0.680 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) |
+| CSPNeXt-m | 256x256 | 0.703 | [cspnext_udp_ap10k.md](./ap10k/cspnext_udp_ap10k.md) |
+
+### Desert Locust Dataset
+
+Results on Desert Locust test set
+
+| Model | Input Size | AUC | EPE | Details and Download |
+| :--------: | :--------: | :---: | :--: | :-------------------------------------------: |
+| ResNet-152 | 160x160 | 0.925 | 1.49 | [resnet_locust.md](./locust/resnet_locust.md) |
+| ResNet-101 | 160x160 | 0.907 | 2.03 | [resnet_locust.md](./locust/resnet_locust.md) |
+| ResNet-50 | 160x160 | 0.900 | 2.27 | [resnet_locust.md](./locust/resnet_locust.md) |
+
+### Grévy’s Zebra Dataset
+
+Results on Grévy’s Zebra test set
+
+| Model | Input Size | AUC | EPE | Details and Download |
+| :--------: | :--------: | :---: | :--: | :----------------------------------------: |
+| ResNet-152 | 160x160 | 0.921 | 1.67 | [resnet_zebra.md](./zebra/resnet_zebra.md) |
+| ResNet-101 | 160x160 | 0.915 | 1.83 | [resnet_zebra.md](./zebra/resnet_zebra.md) |
+| ResNet-50 | 160x160 | 0.914 | 1.87 | [resnet_zebra.md](./zebra/resnet_zebra.md) |
+
+### Animal-Kingdom Dataset
+
+Results on AnimalKingdom test set
+
+| Model | Input Size | class | PCK(0.05) | Details and Download |
+| :-------: | :--------: | :-----------: | :-------: | :---------------------------------------------------: |
+| HRNet-w32 | 256x256 | P1 | 0.6323 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P2 | 0.3741 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P3_mammals | 0.571 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P3_amphibians | 0.5358 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P3_reptiles | 0.51 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P3_birds | 0.7671 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
+| HRNet-w32 | 256x256 | P3_fishes | 0.6406 | [hrnet_animalkingdom.md](./ak/hrnet_animalkingdom.md) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md
new file mode 100644
index 0000000..a2ba7cb
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.md
@@ -0,0 +1,47 @@
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+AnimalKingdom (CVPR'2022)
+
+```bibtex
+@InProceedings{
+ Ng_2022_CVPR,
+ author = {Ng, Xun Long and Ong, Kian Eng and Zheng, Qichen and Ni, Yun and Yeo, Si Yong and Liu, Jun},
+ title = {Animal Kingdom: A Large and Diverse Dataset for Animal Behavior Understanding},
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2022},
+ pages = {19023-19034}
+ }
+```
+
+
+
+Results on AnimalKingdom validation set
+
+| Arch | Input Size | PCK(0.05) | Official Repo | Paper | ckpt | log |
+| ------------------------------------------------------ | ---------- | --------- | ------------- | ------ | ------------------------------------------------------ | ------------------------------------------------------ |
+| [P1_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py) | 256x256 | 0.6323 | 0.6342 | 0.6606 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.json) |
+| [P2_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py) | 256x256 | 0.3741 | 0.3726 | 0.393 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.json) |
+| [P3_mammals_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py) | 256x256 | 0.571 | 0.5719 | 0.6159 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.json) |
+| [P3_amphibians_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py) | 256x256 | 0.5358 | 0.5432 | 0.5674 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.json) |
+| [P3_reptiles_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py) | 256x256 | 0.51 | 0.5 | 0.5606 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.json) |
+| [P3_birds_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py) | 256x256 | 0.7671 | 0.7636 | 0.7735 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.json) |
+| [P3_fishes_hrnet_w32](configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py) | 256x256 | 0.6406 | 0.636 | 0.6825 | [ckpt](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.pth) | [log](https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml
new file mode 100644
index 0000000..b560cd8
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/hrnet_animalkingdom.yml
@@ -0,0 +1,86 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: AnimalKingdom_P1
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.6323
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256-08bf96cb_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P2
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.3741
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256-2396cc58_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P3_amphibian
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.5358
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256-845085f9_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P3_bird
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.7671
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256-566feff5_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P3_fish
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.6406
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256-76c3999f_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P3_mammal
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.571
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256-e8aadf02_20230519.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AnimalKingdom_P3_reptile
+ Name: td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256
+ Results:
+ - Dataset: AnimalKingdom
+ Metrics:
+ PCK: 0.51
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/animal_2d_keypoint/topdown_heatmap/animal_kingdom/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256-e8440c16_20230519.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py
new file mode 100644
index 0000000..9af6952
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P1-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P1/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P1/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
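+# PCKAccuracy with thr=0.05 is the PCK(0.05) metric reported in the
+# README tables (the threshold is a fraction of the instance bbox size)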
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py
new file mode 100644
index 0000000..d7f4238
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P2-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P2/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P2/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py
new file mode 100644
index 0000000..1b54bb7
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_amphibian-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_amphibian/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_amphibian/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py
new file mode 100644
index 0000000..a3e8d9e
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_bird-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_bird/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_bird/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py
new file mode 100644
index 0000000..839e7f9
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_fish-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_fish/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_fish/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py
new file mode 100644
index 0000000..a367693
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_mammal-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_mammal/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_mammal/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py
new file mode 100644
index 0000000..8d2c0d7
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ak/td-hm_hrnet-w32_8xb32-300e_animalkingdom_P3_reptile-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=23,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalKingdomDataset'
+data_mode = 'topdown'
+data_root = 'data/ak/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_reptile/train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ak_P3_reptile/test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [dict(type='PCKAccuracy', thr=0.05), dict(type='AUC')]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md
new file mode 100644
index 0000000..bd17ee5
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.md
@@ -0,0 +1,40 @@
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+Animal-Pose (ICCV'2019)
+
+```bibtex
+@InProceedings{Cao_2019_ICCV,
+ author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
+ title = {Cross-Domain Adaptation for Animal Pose Estimation},
+ booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
+ month = {October},
+ year = {2019}
+}
+```
+
+
+
+Results on AnimalPose validation set (1117 instances)
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.740 | 0.959 | 0.833 | 0.780 | 0.965 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256_20210426.log.json) |
+| [pose_hrnet_w48](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.738 | 0.958 | 0.831 | 0.778 | 0.962 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256-34644726_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256_20210426.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml
new file mode 100644
index 0000000..cb03cec
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/hrnet_animalpose.yml
@@ -0,0 +1,34 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: Animal-Pose
+ Name: td-hm_hrnet-w32_8xb64-210e_animalpose-256x256
+ Results:
+ - Dataset: Animal-Pose
+ Metrics:
+ AP: 0.740
+ AP@0.5: 0.959
+ AP@0.75: 0.833
+ AR: 0.780
+ AR@0.5: 0.965
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: Animal-Pose
+ Name: td-hm_hrnet-w48_8xb64-210e_animalpose-256x256
+ Results:
+ - Dataset: Animal-Pose
+ Metrics:
+ AP: 0.738
+ AP@0.5: 0.958
+ AP@0.75: 0.831
+ AR: 0.778
+ AR@0.5: 0.962
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_animalpose_256x256-34644726_20210426.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md
new file mode 100644
index 0000000..85d3912
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.md
@@ -0,0 +1,41 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+Animal-Pose (ICCV'2019)
+
+```bibtex
+@InProceedings{Cao_2019_ICCV,
+ author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing},
+ title = {Cross-Domain Adaptation for Animal Pose Estimation},
+ booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
+ month = {October},
+ year = {2019}
+}
+```
+
+
+
+Results on AnimalPose validation set (1117 instances)
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.691 | 0.947 | 0.770 | 0.736 | 0.955 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256-e1f30bff_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256_20210426.log.json) |
+| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py) | 256x256 | 0.696 | 0.948 | 0.774 | 0.736 | 0.951 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256-85563f4a_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256_20210426.log.json) |
+| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py) | 256x256 | 0.704 | 0.938 | 0.786 | 0.748 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256-a0a7506c_20210426.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256_20210426.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml
new file mode 100644
index 0000000..2888981
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/resnet_animalpose.yml
@@ -0,0 +1,51 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: Animal-Pose
+ Name: td-hm_res50_8xb64-210e_animalpose-256x256
+ Results:
+ - Dataset: Animal-Pose
+ Metrics:
+ AP: 0.691
+ AP@0.5: 0.947
+ AP@0.75: 0.770
+ AR: 0.736
+ AR@0.5: 0.955
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_animalpose_256x256-e1f30bff_20210426.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: Animal-Pose
+ Name: td-hm_res101_8xb64-210e_animalpose-256x256
+ Results:
+ - Dataset: Animal-Pose
+ Metrics:
+ AP: 0.696
+ AP@0.5: 0.948
+ AP@0.75: 0.774
+ AR: 0.736
+ AR@0.5: 0.951
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_animalpose_256x256-85563f4a_20210426.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: Animal-Pose
+ Name: td-hm_res152_8xb32-210e_animalpose-256x256
+ Results:
+ - Dataset: Animal-Pose
+ Metrics:
+ AP: 0.704
+ AP@0.5: 0.938
+ AP@0.75: 0.786
+ AR: 0.748
+ AR@0.5: 0.946
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_animalpose_256x256-a0a7506c_20210426.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py
new file mode 100644
index 0000000..9dc501d
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py
@@ -0,0 +1,147 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=20,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/animalpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json')
+test_evaluator = val_evaluator
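
These config files are plain Python consumed by MMEngine, so training one outside of `tools/train.py` takes only a few lines. A sketch, assuming the Animal-Pose data has been laid out under `data/animalpose/` as the config expects (the `work_dir` is illustrative):

```python
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/'
    'animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py')
cfg.work_dir = 'work_dirs/hrnet_w32_animalpose'  # output dir for ckpts/logs

runner = Runner.from_cfg(cfg)  # builds model, dataloaders, hooks from cfg
runner.train()
```
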
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py
new file mode 100644
index 0000000..d671f61
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py
@@ -0,0 +1,147 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=20,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/animalpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json')
+test_evaluator = val_evaluator
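
The two-part `param_scheduler` shared by these heatmap configs warms up linearly over the first 500 iterations and then steps the learning rate down at epochs 170 and 200. A sketch of the resulting multiplier (not MMEngine's scheduler code):

```python
def lr_multiplier(iteration, epoch):
    """Approximate lr factor produced by LinearLR + MultiStepLR above."""
    if iteration < 500:                           # LinearLR, by_epoch=False
        return 0.001 + (iteration / 500) * (1 - 0.001)
    drops = sum(epoch >= m for m in (170, 200))   # MultiStepLR milestones
    return 0.1 ** drops                           # gamma = 0.1 per milestone

# e.g. base lr 5e-4 becomes 5e-5 after epoch 170 and 5e-6 after epoch 200
```
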
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py
new file mode 100644
index 0000000..abc4127
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=20,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/animalpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py
new file mode 100644
index 0000000..9e8b4bf
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=20,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/animalpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json')
+test_evaluator = val_evaluator
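
Note that this 8xb32 variant declares `auto_scale_lr = dict(base_batch_size=256)` (8 GPUs x 32) where the 8xb64 configs declare 512. When LR auto-scaling is enabled (it is off by default, e.g. via a `--auto-scale-lr` flag), MMEngine applies the linear scaling rule against that figure. A sketch of the arithmetic:

```python
base_lr = 5e-4
base_batch_size = 256        # 8 GPUs x batch_size 32 for this config
actual_batch_size = 1 * 32   # e.g. a single-GPU run
scaled_lr = base_lr * actual_batch_size / base_batch_size  # 6.25e-05
```
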
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py
new file mode 100644
index 0000000..953b9e9
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=20,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AnimalPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/animalpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/animalpose_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric', ann_file=data_root + 'annotations/animalpose_val.json')
+test_evaluator = val_evaluator
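
`CocoMetric` in these configs reports AP/AR computed from Object Keypoint Similarity (OKS), the keypoint analogue of box IoU. A simplified version of the formula (the real metric also handles visibility flags and per-dataset keypoint sigmas):

```python
import numpy as np

def oks(pred, gt, area, k):
    """Object Keypoint Similarity (sketch).

    pred, gt: (K, 2) keypoints for one instance; area: object area in
    px^2; k: (K,) per-keypoint constants (2x the dataset's sigmas).
    """
    d2 = ((pred - gt) ** 2).sum(axis=-1)          # squared pixel distances
    return float(np.exp(-d2 / (2 * area * k ** 2)).mean())
```
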
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..66391d2
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,220 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
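
Two training tricks distinguish this CSPNeXt+UDP recipe from the plain configs above: an EMA copy of the weights is maintained for evaluation, and the heavy augmentation pipeline is swapped for the gentler `train_pipeline_stage2` over the final 30 epochs via `PipelineSwitchHook`. The EMA update at its core is just this (a simplified sketch; `ExpMomentumEMA` additionally decays the momentum over early iterations):

```python
import torch

@torch.no_grad()
def ema_update(ema_params, live_params, momentum=0.0002):
    """shadow <- (1 - m) * shadow + m * live, applied every iteration."""
    for e, p in zip(ema_params, live_params):
        e.mul_(1.0 - momentum).add_(p, alpha=momentum)
```
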
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md
new file mode 100644
index 0000000..8d1c8d2
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md
@@ -0,0 +1,58 @@
+
+
+
+RTMDet (ArXiv 2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+AP-10K (NeurIPS'2021)
+
+```bibtex
+@misc{yu2021ap10k,
+ title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
+ author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
+ year={2021},
+ eprint={2108.12617},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+Results on AP-10K validation set
+
+| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log |
+| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: |
+| [pose_cspnext_m](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.703 | 0.944 | 0.776 | 0.513 | 0.710 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml
new file mode 100644
index 0000000..da5785c
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py
+ In Collection: UDP
+ Metadata:
+ Architecture: &id001
+ - UDP
+ - CSPNeXt
+ Training Data: AP-10K
+ Name: cspnext-m_udp_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.703
+ AP@0.5: 0.944
+ AP@0.75: 0.776
+ AP (L): 0.710
+ AP (M): 0.513
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md
new file mode 100644
index 0000000..639509d
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.md
@@ -0,0 +1,41 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+AP-10K (NeurIPS'2021)
+
+```bibtex
+@misc{yu2021ap10k,
+ title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
+ author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
+ year={2021},
+ eprint={2108.12617},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+Results on AP-10K validation set
+
+| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log |
+| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: |
+| [pose_hrnet_w32](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.722 | 0.935 | 0.789 | 0.557 | 0.729 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.log.json) |
+| [pose_hrnet_w48](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.728 | 0.936 | 0.802 | 0.577 | 0.735 | [ckpt](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml
new file mode 100644
index 0000000..f485dcb
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/hrnet_ap10k.yml
@@ -0,0 +1,34 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: AP-10K
+ Name: td-hm_hrnet-w32_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.722
+ AP@0.5: 0.935
+ AP@0.75: 0.789
+ AP (L): 0.729
+ AP (M): 0.557
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_ap10k_256x256-18aac840_20211029.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: AP-10K
+ Name: td-hm_hrnet-w48_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.728
+ AP@0.5: 0.936
+ AP@0.75: 0.802
+ AP (L): 0.735
+ AP (M): 0.577
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w48_ap10k_256x256-d95ab412_20211029.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md
new file mode 100644
index 0000000..7dcd2e3
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.md
@@ -0,0 +1,41 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+AP-10K (NeurIPS'2021)
+
+```bibtex
+@misc{yu2021ap10k,
+ title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild},
+ author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao},
+ year={2021},
+ eprint={2108.12617},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+Results on AP-10K validation set
+
+| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log |
+| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: |
+| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.680 | 0.926 | 0.738 | 0.552 | 0.687 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.log.json) |
+| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.681 | 0.921 | 0.751 | 0.545 | 0.690 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml
new file mode 100644
index 0000000..29d5b6e
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml
@@ -0,0 +1,35 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: AP-10K
+ Name: td-hm_res50_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.680
+ AP@0.5: 0.926
+ AP@0.75: 0.738
+ AP (L): 0.687
+ AP (M): 0.552
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: AP-10K
+ Name: td-hm_res101_8xb64-210e_ap10k-256x256
+ Results:
+ - Dataset: AP-10K
+ Metrics:
+ AP: 0.681
+ AP@0.5: 0.921
+ AP@0.75: 0.751
+ AP (L): 0.690
+ AP (M): 0.545
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..9d661cc
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,164 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..fe28073
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,164 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..7c1739d
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,135 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py
new file mode 100644
index 0000000..703470f
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py
@@ -0,0 +1,135 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AP10KDataset'
+data_mode = 'topdown'
+data_root = 'data/ap10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-train-split1.json',
+ data_prefix=dict(img='data/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-val-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ap10k-test-split1.json',
+ data_prefix=dict(img='data/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-val-split1.json')
+test_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ap10k-test-split1.json')
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md
new file mode 100644
index 0000000..ac07e5a
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.md
@@ -0,0 +1,43 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+Desert Locust (Elife'2019)
+
+```bibtex
+@article{graving2019deepposekit,
+ title={DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning},
+ author={Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D},
+ journal={Elife},
+ volume={8},
+ pages={e47994},
+ year={2019},
+ publisher={eLife Sciences Publications Limited}
+}
+```
+
+
+
+Results on Desert Locust test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py) | 160x160 | 1.000 | 0.900 | 2.27 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160-9efca22b_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160_20210407.log.json) |
+| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py) | 160x160 | 1.000 | 0.907 | 2.03 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160-d77986b3_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160_20210407.log.json) |
+| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py) | 160x160 | 1.000 | 0.925 | 1.49 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160-4ea9b372_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160_20210407.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml
new file mode 100644
index 0000000..e05b37d
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/resnet_locust.yml
@@ -0,0 +1,45 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: Desert Locust
+ Name: td-hm_res50_8xb64-210e_locust-160x160
+ Results:
+ - Dataset: Desert Locust
+ Metrics:
+      AUC: 0.900
+      EPE: 2.27
+      PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_locust_160x160-9efca22b_20210407.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: Desert Locust
+ Name: td-hm_res101_8xb64-210e_locust-160x160
+ Results:
+ - Dataset: Desert Locust
+ Metrics:
+ AUC: 0.907
+ EPE: 2.03
+      PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_locust_160x160-d77986b3_20210407.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: Desert Locust
+ Name: td-hm_res152_8xb32-210e_locust-160x160
+ Results:
+ - Dataset: Desert Locust
+ Metrics:
+ AUC: 0.925
+ EPE: 1.49
+ PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_locust_160x160-4ea9b372_20210407.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py
new file mode 100644
index 0000000..7881648
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
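+# editor's note: when auto-scale-lr is enabled at launch, MMEngine applies the
+# linear scaling rule, i.e. lr = 5e-4 * actual_batch_size / 512.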
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
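+# editor's note: 160x160 input vs. 40x40 heatmap gives a 4x output stride.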
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=35,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'LocustDataset'
+data_mode = 'topdown'
+data_root = 'data/locust/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
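+# editor's note: PCK@0.2 counts a keypoint as correct when its distance to the
+# ground truth is within 0.2 of the bbox-based normalization size.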
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py
new file mode 100644
index 0000000..c7bdca7
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=35,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'LocustDataset'
+data_mode = 'topdown'
+data_root = 'data/locust/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py
new file mode 100644
index 0000000..309af14
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=35,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'LocustDataset'
+data_mode = 'topdown'
+data_root = 'data/locust/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/locust_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md
new file mode 100644
index 0000000..c49c11e
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.md
@@ -0,0 +1,43 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+Grévy’s Zebra (Elife'2019)
+
+```bibtex
+@article{graving2019deepposekit,
+ title={DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning},
+ author={Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D},
+ journal={Elife},
+ volume={8},
+ pages={e47994},
+ year={2019},
+ publisher={eLife Sciences Publications Limited}
+}
+```
+
+
+
+Results on Grévy’s Zebra test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_resnet_50](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.914 | 1.87 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160-5a104833_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160_20210407.log.json) |
+| [pose_resnet_101](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.915 | 1.83 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160-e8cb2010_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160_20210407.log.json) |
+| [pose_resnet_152](/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py) | 160x160 | 1.000 | 0.921 | 1.67 | [ckpt](https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160-05de71dd_20210407.pth) | [log](https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160_20210407.log.json) |
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml
new file mode 100644
index 0000000..68ffbe7
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/resnet_zebra.yml
@@ -0,0 +1,45 @@
+Models:
+- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: "Gr\xE9vy\u2019s Zebra"
+ Name: td-hm_res50_8xb64-210e_zebra-160x160
+ Results:
+ - Dataset: "Gr\xE9vy\u2019s Zebra"
+ Metrics:
+ AUC: 0.914
+ EPE: 1.87
+ PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_zebra_160x160-5a104833_20210407.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: "Gr\xE9vy\u2019s Zebra"
+ Name: td-hm_res101_8xb64-210e_zebra-160x160
+ Results:
+ - Dataset: "Gr\xE9vy\u2019s Zebra"
+ Metrics:
+ AUC: 0.915
+ EPE: 1.83
+ PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_zebra_160x160-e8cb2010_20210407.pth
+- Config: configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: "Gr\xE9vy\u2019s Zebra"
+ Name: td-hm_res152_8xb32-210e_zebra-160x160
+ Results:
+ - Dataset: "Gr\xE9vy\u2019s Zebra"
+ Metrics:
+ AUC: 0.921
+ EPE: 1.67
+ PCK@0.2: 1.0
+ Task: Animal 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/animal/resnet/res152_zebra_160x160-05de71dd_20210407.pth
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py
new file mode 100644
index 0000000..9a22a33
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=9,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'ZebraDataset'
+data_mode = 'topdown'
+data_root = 'data/zebra/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py
new file mode 100644
index 0000000..d1840b8
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=9,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'ZebraDataset'
+data_mode = 'topdown'
+data_root = 'data/zebra/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py
new file mode 100644
index 0000000..f9dc0e3
--- /dev/null
+++ b/modules/rtmpose/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(160, 160), heatmap_size=(40, 40), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=9,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'ZebraDataset'
+data_mode = 'topdown'
+data_root = 'data/zebra/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/zebra_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/README.md b/modules/rtmpose/configs/body_2d_keypoint/README.md
new file mode 100644
index 0000000..15f244a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/README.md
@@ -0,0 +1,21 @@
+# Human Body 2D Pose Estimation
+
+Multi-person human pose estimation is the task of detecting the poses (i.e. keypoints) of all people in an input image.
+
+Existing approaches fall into two categories: top-down and bottom-up.
+
+Top-down methods (e.g. DeepPose) divide the task into two stages: a person detector first produces human bounding boxes, then a single-person pose estimator runs on each box, as sketched below.
+
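+A rough sketch of this two-stage flow with the mmdet/mmpose inference APIs
+(the config and checkpoint paths are placeholders, not files in this repo):
+
+```python
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_topdown, init_model
+
+det_model = init_detector('det_config.py', 'det_ckpt.pth', device='cuda:0')
+pose_model = init_model('pose_config.py', 'pose_ckpt.pth', device='cuda:0')
+
+# stage 1: person detection
+det_result = inference_detector(det_model, 'demo.jpg')
+inst = det_result.pred_instances
+# keep confident person boxes (COCO label 0) as xyxy arrays
+bboxes = inst.bboxes[(inst.labels == 0) & (inst.scores > 0.3)].cpu().numpy()
+
+# stage 2: single-person pose estimation inside each box
+pose_results = inference_topdown(pose_model, 'demo.jpg', bboxes)
+```
+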
+Bottom-up approaches (e.g. Associative Embedding) first detect all the keypoints and then group/associate them into person instances.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_body_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/2d_human_pose_demo.md#2d-human-pose-demo) to run demos.
+
+
+
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md
new file mode 100644
index 0000000..5592374
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/README.md
@@ -0,0 +1,9 @@
+# Associative embedding: End-to-end learning for joint detection and grouping (AE)
+
+Associative Embedding is one of the most popular 2D bottom-up pose estimation approaches. It first detects all keypoints and then groups/associates them into person instances.
+
+To group the predicted keypoints into individuals, a tag is also predicted for each detected keypoint. Tags of keypoints belonging to the same person are similar, while tags of different people differ, so keypoints can be grouped by comparing tags, as in the toy sketch below.
+
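+A toy, one-dimensional illustration of that grouping (greedy matching on tag
+distance; the actual decoder is more involved):
+
+```python
+import numpy as np
+
+def group_by_tags(keypoints, tags, thr=1.0):
+    """keypoints: (N, 2) coords; tags: (N,) scalar embedding per keypoint."""
+    people = []  # each person: {'tags': [...], 'kpts': [...]}
+    for kpt, tag in zip(keypoints, tags):
+        # attach to the first person whose mean tag is close enough
+        match = next((p for p in people
+                      if abs(tag - np.mean(p['tags'])) < thr), None)
+        if match is None:
+            match = {'tags': [], 'kpts': []}
+            people.append(match)
+        match['tags'].append(tag)
+        match['kpts'].append(kpt)
+    return people
+```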
+
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
new file mode 100644
index 0000000..c20df08
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
@@ -0,0 +1,166 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1.5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=300,
+ milestones=[200, 260],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=192)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', interval=50))
+
+# codec settings
+codec = dict(
+ type='AssociativeEmbedding',
+ input_size=(512, 512),
+ heatmap_size=(128, 128),
+ sigma=2,
+ decode_topk=30,
+ decode_center_shift=0.5,
+ decode_keypoint_order=[
+ 0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16
+ ],
+ decode_max_instances=30)
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='AssociativeEmbeddingHead',
+ in_channels=32,
+ num_keypoints=17,
+ tag_dim=1,
+ tag_per_keypoint=True,
+ deconv_out_channels=None,
+ keypoint_loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ tag_loss=dict(type='AssociativeEmbeddingLoss', loss_weight=0.001),
+ # The heatmap will be resized to the input size before decoding
+ # if ``restore_heatmap_size==True``
+ decoder=dict(codec, heatmap_size=codec['input_size'])),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ shift_heatmap=False,
+ restore_heatmap_size=True,
+ align_corners=False))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = []
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=64,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=24,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none',
+ score_mode='bbox',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md
new file mode 100644
index 0000000..57b6fe0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.md
@@ -0,0 +1,57 @@
+
+
+
+Associative Embedding (NIPS'2017)
+
+```bibtex
+@inproceedings{newell2017associative,
+ title={Associative embedding: End-to-end learning for joint detection and grouping},
+ author={Newell, Alejandro and Huang, Zhiao and Deng, Jia},
+ booktitle={Advances in neural information processing systems},
+ pages={2277--2287},
+ year={2017}
+}
+```
+
+
+
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [HRNet-w32](/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py) | 512x512 | 0.656 | 0.864 | 0.719 | 0.711 | 0.893 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml
new file mode 100644
index 0000000..e1ca3f8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/associative_embedding/coco/hrnet_coco.yml
@@ -0,0 +1,25 @@
+Collections:
+- Name: AE
+ Paper:
+ Title: "Associative embedding: End-to-end learning for joint detection and grouping"
+ URL: https://arxiv.org/abs/1611.05424
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/associative_embedding.md
+Models:
+- Config: configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
+ In Collection: AE
+ Metadata:
+ Architecture:
+ - AE
+ - HRNet
+ Training Data: COCO
+ Name: ae_hrnet-w32_8xb24-300e_coco-512x512
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.656
+ AP@0.5: 0.864
+ AP@0.75: 0.719
+ AR: 0.711
+ AR@0.5: 0.893
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py
new file mode 100644
index 0000000..9c6c4ce
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py
@@ -0,0 +1,164 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=140,
+ milestones=[90, 120],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=160)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='CIDHead',
+ in_channels=480,
+ num_keypoints=17,
+ gfd_channels=32,
+ coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
+ decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
+ contrastive_loss=dict(
+ type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
+ decoder=codec,
+ ),
+ train_cfg=dict(max_train_instances=200),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ shift_heatmap=False,
+ align_corners=False))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='BottomupGetHeatmapMask'),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=64,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=20,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_thr=0.8,
+ score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py
new file mode 100644
index 0000000..f8042b6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py
@@ -0,0 +1,164 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=140,
+ milestones=[90, 120],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=160)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='CIDHead',
+ in_channels=720,
+ num_keypoints=17,
+ gfd_channels=48,
+ coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
+ decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
+ contrastive_loss=dict(
+ type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
+ decoder=codec,
+ ),
+ train_cfg=dict(max_train_instances=200),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ shift_heatmap=False,
+ align_corners=False))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='BottomupGetHeatmapMask'),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=64,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=20,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_thr=0.8,
+ score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md
new file mode 100644
index 0000000..97d83e2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.md
@@ -0,0 +1,42 @@
+
+
+
+CID (CVPR'2022)
+
+```bibtex
+@InProceedings{Wang_2022_CVPR,
+ author = {Wang, Dongkai and Zhang, Shiliang},
+ title = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2022},
+ pages = {11060-11068}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py) | 512x512 | 0.704 | 0.894 | 0.775 | 0.753 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_20230207.json) |
+| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py) | 512x512 | 0.715 | 0.900 | 0.782 | 0.765 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_20230207.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml
new file mode 100644
index 0000000..efd5ee6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/cid/coco/hrnet_coco.yml
@@ -0,0 +1,41 @@
+Collections:
+- Name: CID
+ Paper:
+ Title: Contextual Instance Decoupling for Robust Multi-Person Pose Estimation
+ URL: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_CVPR_2022_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/cid.md
+Models:
+- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py
+ In Collection: CID
+ Metadata:
+ Architecture: &id001
+ - CID
+ - HRNet
+ Training Data: COCO
+ Name: cid_hrnet-w32_8xb20-140e_coco-512x512
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.704
+ AP@0.5: 0.894
+ AP@0.75: 0.775
+ AR: 0.753
+ AR@0.5: 0.928
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth
+- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py
+ In Collection: CID
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: cid_hrnet-w48_8xb20-140e_coco-512x512
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.715
+      AP@0.5: 0.900
+ AP@0.75: 0.782
+ AR: 0.765
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md
new file mode 100644
index 0000000..e30a9e9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/README.md
@@ -0,0 +1,22 @@
+# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression (DEKR)
+
+
+
+
+DEKR (CVPR'2021)
+
+```bibtex
+@inproceedings{geng2021bottom,
+ title={Bottom-up human pose estimation via disentangled keypoint regression},
+ author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={14676--14686},
+ year={2021}
+}
+```
+
+
+
+DEKR is a popular 2D bottom-up pose estimation approach that simultaneously detects all person instances and regresses the offsets from each instance center to its joints.
+
+To predict the offsets more accurately, the offsets of different joints are regressed by separate branches with deformable convolutional layers, so that convolution kernels of different shapes extract features tailored to each joint.
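+
+A schematic of the decoding step (an editor's sketch, not the mmpose decoder): once the center heatmap yields M instance centers, the K regressed offsets per center directly give absolute joint locations.
+
+```python
+import numpy as np
+
+def decode_poses(centers, offsets):
+    """centers: (M, 2) instance centers; offsets: (M, K, 2) center-to-joint."""
+    return centers[:, None, :] + offsets  # (M, K, 2) absolute joint coords
+```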
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py
new file mode 100644
index 0000000..9d94ac5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py
@@ -0,0 +1,189 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=140,
+ milestones=[90, 120],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=80)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='SPR',
+ input_size=(512, 512),
+ heatmap_size=(128, 128),
+ sigma=(4, 2),
+ minimal_diagonal_length=32**0.5,
+ generate_keypoint_heatmaps=True,
+ decode_max_instances=30)
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='DEKRHead',
+ in_channels=480,
+ num_keypoints=17,
+ heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ displacement_loss=dict(
+ type='SoftWeightSmoothL1Loss',
+ use_target_weight=True,
+ supervise_empty=False,
+ beta=1 / 9,
+ loss_weight=0.002,
+ ),
+ decoder=codec,
+ # This rescore net is adapted from the official repo.
+ # If you are not using the original COCO dataset for training,
+ # please make sure to remove the `rescore_cfg` item
+ rescore_cfg=dict(
+ in_channels=74,
+ norm_indexes=(5, 6),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/kpt_rescore_coco-33d58c5c.pth')),
+ ),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ nms_dist_thr=0.05,
+ shift_heatmap=True,
+ align_corners=False))
+
+# enable DDP training when rescore net is used
+find_unused_parameters = True
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='BottomupGetHeatmapMask'),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=32,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=10,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none',
+ score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py
new file mode 100644
index 0000000..c4255fd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py
@@ -0,0 +1,190 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=140,
+ milestones=[90, 120],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=80)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='SPR',
+ input_size=(640, 640),
+ heatmap_size=(160, 160),
+ sigma=(4, 2),
+ minimal_diagonal_length=32**0.5,
+ generate_keypoint_heatmaps=True,
+ decode_max_instances=30)
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='DEKRHead',
+ in_channels=720,
+ num_keypoints=17,
+ num_heatmap_filters=48,
+ heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ displacement_loss=dict(
+ type='SoftWeightSmoothL1Loss',
+ use_target_weight=True,
+ supervise_empty=False,
+ beta=1 / 9,
+ loss_weight=0.002,
+ ),
+ decoder=codec,
+ # This rescore net is adapted from the official repo.
+ # If you are not using the original COCO dataset for training,
+ # please make sure to remove the `rescore_cfg` item
+ rescore_cfg=dict(
+ in_channels=74,
+ norm_indexes=(5, 6),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/kpt_rescore_coco-33d58c5c.pth')),
+ ),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ nms_dist_thr=0.05,
+ shift_heatmap=True,
+ align_corners=False))
+
+# enable DDP training when rescore net is used
+find_unused_parameters = True
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='BottomupGetHeatmapMask'),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=32,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=10,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none',
+ score_mode='keypoint',
+)
+test_evaluator = val_evaluator
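+
+# A minimal usage sketch, assuming the standard mmengine API and this
+# repository's layout. Config files are executed on load, so the example is
+# kept in comments; `Config.fromfile` resolves the `_base_` inheritance chain
+# declared at the top of this file.
+#
+#   from mmengine.config import Config
+#   cfg = Config.fromfile(
+#       'modules/rtmpose/configs/body_2d_keypoint/dekr/coco/'
+#       'dekr_hrnet-w48_8xb10-140e_coco-640x640.py')
+#   print(cfg.model.head.type)              # 'DEKRHead'
+#   print(cfg.train_dataloader.batch_size)  # 10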
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md
new file mode 100644
index 0000000..bb2e279
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md
@@ -0,0 +1,58 @@
+
+
+
+DEKR (CVPR'2021)
+
+```bibtex
+@inproceedings{geng2021bottom,
+ title={Bottom-up human pose estimation via disentangled keypoint regression},
+ author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={14676--14686},
+ year={2021}
+}
+```
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [HRNet-w32](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py) | 512x512 | 0.686 | 0.868 | 0.750 | 0.735 | 0.898 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_20221228.json) |
+| [HRNet-w48](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py) | 640x640 | 0.714 | 0.883 | 0.777 | 0.762 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_20230124.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml
new file mode 100644
index 0000000..f34a91d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/coco/hrnet_coco.yml
@@ -0,0 +1,41 @@
+Collections:
+- Name: DEKR
+ Paper:
+ Title: Bottom-up human pose estimation via disentangled keypoint regression
+ URL: https://arxiv.org/abs/2104.02300
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dekr.md
+Models:
+- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py
+ In Collection: DEKR
+ Metadata:
+ Architecture: &id001
+ - DEKR
+ - HRNet
+ Training Data: COCO
+ Name: dekr_hrnet-w32_8xb10-140e_coco-512x512
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.686
+ AP@0.5: 0.868
+ AP@0.75: 0.750
+ AR: 0.735
+ AR@0.5: 0.898
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth
+- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py
+ In Collection: DEKR
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: dekr_hrnet-w48_8xb10-140e_coco-640x640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.714
+ AP@0.5: 0.883
+ AP@0.75: 0.777
+ AR: 0.762
+ AR@0.5: 0.915
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py
new file mode 100644
index 0000000..adfc36e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py
@@ -0,0 +1,190 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=20)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=300,
+ milestones=[200, 260],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=80)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='SPR',
+ input_size=(512, 512),
+ heatmap_size=(128, 128),
+ sigma=(4, 2),
+ minimal_diagonal_length=32**0.5,
+ generate_keypoint_heatmaps=True,
+ decode_max_instances=30)
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='DEKRHead',
+ in_channels=480,
+ num_keypoints=14,
+ heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ displacement_loss=dict(
+ type='SoftWeightSmoothL1Loss',
+ use_target_weight=True,
+ supervise_empty=False,
+ beta=1 / 9,
+ loss_weight=0.004,
+ ),
+ decoder=codec,
+ # This rescore net is adapted from the official repo.
+ # If you are not using the original CrowdPose dataset for training,
+ # please make sure to remove the `rescore_cfg` item
+ rescore_cfg=dict(
+ in_channels=59,
+ norm_indexes=(0, 1),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')),
+ ),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ nms_dist_thr=0.05,
+ shift_heatmap=True,
+ align_corners=False))
+
+# enable DDP training when rescore net is used
+find_unused_parameters = True
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'bottomup'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=32,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=10,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ nms_mode='none',
+ score_mode='keypoint',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py
new file mode 100644
index 0000000..89e1a4d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py
@@ -0,0 +1,191 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=300, val_interval=20)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=300,
+ milestones=[200, 260],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=40)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='SPR',
+ input_size=(640, 640),
+ heatmap_size=(160, 160),
+ sigma=(4, 2),
+ minimal_diagonal_length=32**0.5,
+ generate_keypoint_heatmaps=True,
+ decode_max_instances=30)
+
+# model settings
+model = dict(
+ type='BottomupPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384),
+ multiscale_output=True)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='DEKRHead',
+ in_channels=720,
+ num_keypoints=14,
+ num_heatmap_filters=48,
+ heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ displacement_loss=dict(
+ type='SoftWeightSmoothL1Loss',
+ use_target_weight=True,
+ supervise_empty=False,
+ beta=1 / 9,
+ loss_weight=0.004,
+ ),
+ decoder=codec,
+ # This rescore net is adapted from the official repo.
+ # If you are not using the original CrowdPose dataset for training,
+ # please make sure to remove the `rescore_cfg` item
+ rescore_cfg=dict(
+ in_channels=59,
+ norm_indexes=(0, 1),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')),
+ ),
+ test_cfg=dict(
+ multiscale_test=False,
+ flip_test=True,
+ nms_dist_thr=0.05,
+ shift_heatmap=True,
+ align_corners=False))
+
+# enable DDP training when rescore net is used
+find_unused_parameters = True
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'bottomup'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='BottomupRandomAffine', input_size=codec['input_size']),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize',
+ input_size=codec['input_size'],
+ size_factor=32,
+ resize_mode='expand'),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=5,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ nms_mode='none',
+ score_mode='keypoint',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md
new file mode 100644
index 0000000..9b1d251
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md
@@ -0,0 +1,56 @@
+
+
+
+DEKR (CVPR'2021)
+
+```bibtex
+@inproceedings{geng2021bottom,
+ title={Bottom-up human pose estimation via disentangled keypoint regression},
+ author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={14676--14686},
+ year={2021}
+}
+```
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+
+
+Results on CrowdPose test without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [HRNet-w32](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py) | 512x512 | 0.663 | 0.857 | 0.714 | 0.740 | 0.671 | 0.576 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_20221228.json) |
+| [HRNet-w48](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py) | 640x640 | 0.679 | 0.869 | 0.731 | 0.753 | 0.688 | 0.593 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_20230128.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml
new file mode 100644
index 0000000..c65d5a9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.yml
@@ -0,0 +1,37 @@
+Models:
+- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py
+ In Collection: DEKR
+ Metadata:
+ Architecture: &id001
+ - DEKR
+ - HRNet
+ Training Data: CrowdPose
+ Name: dekr_hrnet-w32_8xb10-300e_crowdpose-512x512
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.663
+ AP@0.5: 0.857
+ AP@0.75: 0.714
+ AP (E): 0.74
+ AP (M): 0.671
+      AP (H): 0.576
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth
+- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py
+ In Collection: DEKR
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: dekr_hrnet-w48_8xb5-300e_crowdpose-640x640
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.679
+ AP@0.5: 0.869
+ AP@0.75: 0.731
+ AP (E): 0.753
+ AP (M): 0.688
+      AP (H): 0.593
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md
new file mode 100644
index 0000000..ffb4271
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.md
@@ -0,0 +1,62 @@
+
+
+
+ED-Pose (ICLR'2023)
+
+```bibtex
+@inproceedings{yang2023explicit,
+  title={Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation},
+  author={Jie Yang and Ailing Zeng and Shilong Liu and Feng Li and Ruimao Zhang and Lei Zhang},
+  booktitle={International Conference on Learning Representations},
+  year={2023},
+  url={https://openreview.net/forum?id=s4WVupnJjmX}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017.
+
+| Arch | Backbone | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :-------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------------------: | :-------------------------------------------: |
+| [edpose_res50_coco](/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py) | ResNet-50 | 0.716 | 0.897 | 0.783 | 0.793 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.json) |
+
+The checkpoint is converted from the official repo. Training ED-Pose is not yet supported; it will be added in a future update.
+
+The above config follows [Pure Python style](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta). Please install `mmengine>=0.8.2` to use this config.
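+
+Since only inference is currently supported, the converted checkpoint can be used directly. Below is a minimal sketch, assuming the mmpose 1.x bottom-up inference API (`init_model` / `inference_bottomup`); the image path is illustrative.
+
+```python
+from mmpose.apis import inference_bottomup, init_model
+
+config = 'configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py'
+checkpoint = ('https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/'
+              'edpose/coco/edpose_res50_coco_3rdparty.pth')
+
+model = init_model(config, checkpoint, device='cpu')
+result = inference_bottomup(model, 'demo.jpg')[0]  # a PoseDataSample
+print(result.pred_instances.keypoints.shape)  # (num_instances, 17, 2)
+```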
diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml
new file mode 100644
index 0000000..bf461e3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_coco.yml
@@ -0,0 +1,26 @@
+Collections:
+- Name: ED-Pose
+ Paper:
+ Title: Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation
+ URL: https://arxiv.org/pdf/2302.01593.pdf
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/edpose.md
+Models:
+- Config: configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py
+ In Collection: ED-Pose
+ Alias: edpose
+ Metadata:
+ Architecture: &id001
+ - ED-Pose
+ - ResNet
+ Training Data: COCO
+ Name: edpose_res50_8xb2-50e_coco-800x1333
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.716
+ AP@0.5: 0.897
+ AP@0.75: 0.783
+ AR: 0.793
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py
new file mode 100644
index 0000000..0940b83
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py
@@ -0,0 +1,236 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+ from mmpose.configs._base_.default_runtime import * # noqa
+
+from mmcv.transforms import RandomChoice, RandomChoiceResize
+from mmengine.dataset import DefaultSampler
+from mmengine.model import PretrainedInit
+from mmengine.optim import LinearLR, MultiStepLR
+from torch.nn import GroupNorm
+from torch.optim import Adam
+
+from mmpose.codecs import EDPoseLabel
+from mmpose.datasets import (BottomupRandomChoiceResize, BottomupRandomCrop,
+ CocoDataset, LoadImage, PackPoseInputs,
+ RandomFlip)
+from mmpose.evaluation import CocoMetric
+from mmpose.models import (BottomupPoseEstimator, ChannelMapper, EDPoseHead,
+ PoseDataPreprocessor, ResNet)
+from mmpose.models.utils import FrozenBatchNorm2d
+
+# runtime
+train_cfg.update(max_epochs=50, val_interval=10) # noqa
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type=Adam,
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(type=LinearLR, begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type=MultiStepLR,
+ begin=0,
+ end=140,
+ milestones=[33, 45],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=80)
+
+# hooks
+default_hooks.update( # noqa
+ checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(type=EDPoseLabel, num_select=50, num_keypoints=17)
+
+# model settings
+model = dict(
+ type=BottomupPoseEstimator,
+ data_preprocessor=dict(
+ type=PoseDataPreprocessor,
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_size_divisor=1),
+ backbone=dict(
+ type=ResNet,
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type=FrozenBatchNorm2d, requires_grad=False),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(
+ type=PretrainedInit, checkpoint='torchvision://resnet50')),
+ neck=dict(
+ type=ChannelMapper,
+ in_channels=[512, 1024, 2048],
+ kernel_size=1,
+ out_channels=256,
+ act_cfg=None,
+ norm_cfg=dict(type=GroupNorm, num_groups=32),
+ num_outs=4),
+ head=dict(
+ type=EDPoseHead,
+ num_queries=900,
+ num_feature_levels=4,
+ num_keypoints=17,
+ as_two_stage=True,
+ encoder=dict(
+ num_layers=6,
+ layer_cfg=dict( # DeformableDetrTransformerEncoderLayer
+ self_attn_cfg=dict( # MultiScaleDeformableAttention
+ embed_dims=256,
+ num_heads=8,
+ num_levels=4,
+ num_points=4,
+ batch_first=True),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=2048,
+ num_fcs=2,
+ ffn_drop=0.0))),
+ decoder=dict(
+ num_layers=6,
+ embed_dims=256,
+ layer_cfg=dict( # DeformableDetrTransformerDecoderLayer
+ self_attn_cfg=dict( # MultiheadAttention
+ embed_dims=256,
+ num_heads=8,
+ batch_first=True),
+ cross_attn_cfg=dict( # MultiScaleDeformableAttention
+ embed_dims=256,
+ batch_first=True),
+ ffn_cfg=dict(
+ embed_dims=256, feedforward_channels=2048, ffn_drop=0.1)),
+ query_dim=4,
+ num_feature_levels=4,
+ num_group=100,
+ num_dn=100,
+ num_box_decoder_layers=2,
+ return_intermediate=True),
+ out_head=dict(num_classes=2),
+ positional_encoding=dict(
+ num_pos_feats=128,
+ temperatureH=20,
+ temperatureW=20,
+ normalize=True),
+ denosing_cfg=dict(
+ dn_box_noise_scale=0.4,
+ dn_label_noise_ratio=0.5,
+ dn_labelbook_size=100,
+ dn_attn_mask_type_list=['match2dn', 'dn2dn', 'group2group']),
+ data_decoder=codec),
+    test_cfg=dict(multiscale_test=False, flip_test=False, num_select=50),
+ train_cfg=dict())
+
+# allow unused parameters during DDP training
+find_unused_parameters = True
+
+# base dataset settings
+dataset_type = CocoDataset
+data_mode = 'bottomup'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type=LoadImage),
+ dict(type=RandomFlip, direction='horizontal'),
+ dict(
+ type=RandomChoice,
+ transforms=[
+ [
+ dict(
+ type=RandomChoiceResize,
+ scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+ (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+ (736, 1333), (768, 1333), (800, 1333)],
+ keep_ratio=True)
+ ],
+ [
+ dict(
+ type=BottomupRandomChoiceResize,
+                    # The aspect ratio of all images in the train dataset
+                    # is < 7, following the original implementation
+ scales=[(400, 4200), (500, 4200), (600, 4200)],
+ keep_ratio=True),
+ dict(
+ type=BottomupRandomCrop,
+ crop_type='absolute_range',
+ crop_size=(384, 600),
+ allow_negative_crop=True),
+ dict(
+ type=BottomupRandomChoiceResize,
+ scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+ (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+ (736, 1333), (768, 1333), (800, 1333)],
+ keep_ratio=True)
+ ]
+ ]),
+ dict(type=PackPoseInputs),
+]
+
+val_pipeline = [
+ dict(type=LoadImage),
+ dict(
+ type=BottomupRandomChoiceResize,
+ scales=[(800, 1333)],
+ keep_ratio=True,
+ backend='pillow'),
+ dict(
+ type=PackPoseInputs,
+ meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
+ 'img_shape', 'input_size', 'input_center', 'input_scale',
+ 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
+ 'skeleton_links'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=1,
+ num_workers=1,
+ persistent_workers=True,
+ sampler=dict(type=DefaultSampler, shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type=DefaultSampler, shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type=CocoMetric,
+ nms_mode='none',
+ score_mode='keypoint',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md
new file mode 100644
index 0000000..967b98e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/README.md
@@ -0,0 +1,15 @@
+# Top-down integral-regression-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the detected bounding boxes. In the second stage, integral-regression-based methods apply a simple integral operation that relates and unifies heatmap prediction and joint regression differentiably, obtaining the keypoint coordinates from the features extracted within the bounding box area, following the paradigm introduced in [Integral Human Pose Regression](https://arxiv.org/abs/1711.08229).
+
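+As a minimal sketch of the integral (soft-argmax) operation itself, independent of the actual mmpose decoder implementation: the heatmap is normalized with a softmax and each coordinate is taken as the expectation over pixel locations, which keeps the whole localization step differentiable.
+
+```python
+import torch
+
+
+def soft_argmax_2d(heatmaps: torch.Tensor) -> torch.Tensor:
+    """Integral regression over raw keypoint heatmaps of shape (K, H, W)."""
+    k, h, w = heatmaps.shape
+    # Normalize each heatmap into a 2D probability distribution.
+    probs = heatmaps.flatten(1).softmax(dim=1).reshape(k, h, w)
+    xs = torch.arange(w, dtype=probs.dtype)
+    ys = torch.arange(h, dtype=probs.dtype)
+    # Expected x: marginalize over rows, then take the expectation over columns.
+    x = (probs.sum(dim=1) * xs).sum(dim=1)
+    # Expected y: marginalize over columns, then take the expectation over rows.
+    y = (probs.sum(dim=2) * ys).sum(dim=1)
+    return torch.stack([x, y], dim=1)  # (K, 2) sub-pixel coordinates
+```
+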
+## Results and Models
+
+### COCO Dataset
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Model | Input Size | AP | AR | Details and Download |
+| :------------------: | :--------: | :---: | :---: | :---------------------------------------------------: |
+| ResNet-50+Debias-IPR | 256x256 | 0.675 | 0.765 | [resnet_debias_coco.md](./coco/resnet_debias_coco.md) |
+| ResNet-50+DSNT | 256x256 | 0.674 | 0.764 | [resnet_dsnt_coco.md](./coco/resnet_dsnt_coco.md) |
+| ResNet-50+IPR | 256x256 | 0.633 | 0.730 | [resnet_ipr_coco.md](./coco/resnet_ipr_coco.md) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py
new file mode 100644
index 0000000..eb60eec
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py
@@ -0,0 +1,134 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='IntegralRegressionLabel',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2.0,
+ normalize=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ ),
+ head=dict(
+ type='DSNTHead',
+ in_channels=2048,
+ in_featuremap_size=(8, 8),
+ num_joints=17,
+ loss=dict(
+ type='MultipleLossWrapper',
+ losses=[
+ dict(type='SmoothL1Loss', use_target_weight=True),
+ dict(type='KeypointMSELoss', use_target_weight=True)
+ ]),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ shift_heatmap=True,
+ ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py
new file mode 100644
index 0000000..9a9cce5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py
@@ -0,0 +1,136 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='IntegralRegressionLabel',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2.0,
+ normalize=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ ),
+ head=dict(
+ type='DSNTHead',
+ in_channels=2048,
+ in_featuremap_size=(8, 8),
+ num_joints=17,
+ debias=True,
+ beta=10.,
+ loss=dict(
+ type='MultipleLossWrapper',
+ losses=[
+ dict(type='SmoothL1Loss', use_target_weight=True),
+ dict(type='JSDiscretLoss', use_target_weight=True)
+ ]),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ shift_heatmap=True,
+ ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py
new file mode 100644
index 0000000..7b262d9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py
@@ -0,0 +1,134 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='IntegralRegressionLabel',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2.0,
+ normalize=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ ),
+ head=dict(
+ type='DSNTHead',
+ in_channels=2048,
+ in_featuremap_size=(8, 8),
+ num_joints=17,
+ loss=dict(
+ type='MultipleLossWrapper',
+ losses=[
+ dict(type='SmoothL1Loss', use_target_weight=True),
+ dict(type='JSDiscretLoss', use_target_weight=True)
+ ]),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ shift_heatmap=True,
+ ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md
new file mode 100644
index 0000000..406d2b6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.md
@@ -0,0 +1,57 @@
+
+
+
+Debias IPR (ICCV'2021)
+
+```bibtex
+@inproceedings{gu2021removing,
+ title={Removing the Bias of Integral Pose Regression},
+ author={Gu, Kerui and Yang, Linlin and Yao, Angela},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={11067--11076},
+ year={2021}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [debias-ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py) | 256x256 | 0.675 | 0.872 | 0.740 | 0.765 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml
new file mode 100644
index 0000000..155cdbf
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_debias_coco.yml
@@ -0,0 +1,25 @@
+Collections:
+- Name: DebiasIPR
+ Paper:
+ Title: Removing the Bias of Integral Pose Regression
+ URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/debias_ipr.md
+Models:
+- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py
+ In Collection: DebiasIPR
+ Metadata:
+ Architecture: &id001
+ - Debias
+ - ResNet
+ Training Data: COCO
+  Name: ipr_res50_debias-8xb64-210e_coco-256x256
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.675
+ AP@0.5: 0.872
+ AP@0.75: 0.74
+ AR: 0.765
+ AR@0.5: 0.928
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md
new file mode 100644
index 0000000..d59266b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.md
@@ -0,0 +1,56 @@
+
+
+
+DSNT (2018)
+
+```bibtex
+@article{nibali2018numerical,
+ title={Numerical Coordinate Regression with Convolutional Neural Networks},
+ author={Nibali, Aiden and He, Zhen and Morgan, Stuart and Prendergast, Luke},
+ journal={arXiv preprint arXiv:1801.07372},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ipr_resnet_50_dsnt](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py) | 256x256 | 0.674 | 0.870 | 0.744 | 0.764 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml
new file mode 100644
index 0000000..fa772e8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_dsnt_coco.yml
@@ -0,0 +1,25 @@
+Collections:
+- Name: DSNT
+ Paper:
+ Title: Numerical Coordinate Regression with Convolutional Neural Networks
+ URL: https://arxiv.org/abs/1801.07372v2
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dsnt.md
+Models:
+- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py
+ In Collection: DSNT
+ Metadata:
+ Architecture: &id001
+ - DSNT
+ - ResNet
+ Training Data: COCO
+ Name: ipr_res50_dsnt-8xb64-210e_coco-256x256
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.674
+ AP@0.5: 0.87
+ AP@0.75: 0.744
+ AR: 0.764
+ AR@0.5: 0.928
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md
new file mode 100644
index 0000000..d51e5df
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.md
@@ -0,0 +1,57 @@
+
+
+
+IPR (ECCV'2018)
+
+```bibtex
+@inproceedings{sun2018integral,
+ title={Integral human pose regression},
+ author={Sun, Xiao and Xiao, Bin and Wei, Fangyin and Liang, Shuang and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={529--545},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py) | 256x256 | 0.633 | 0.860 | 0.703 | 0.730 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml
new file mode 100644
index 0000000..d40d190
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/integral_regression/coco/resnet_ipr_coco.yml
@@ -0,0 +1,25 @@
+Collections:
+- Name: IPR
+ Paper:
+ Title: Integral human pose regression
+ URL: https://arxiv.org/abs/1711.08229
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/ipr.md
+Models:
+- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py
+ In Collection: IPR
+ Metadata:
+ Architecture: &id001
+ - IPR
+ - ResNet
+ Training Data: COCO
+ Name: ipr_res50_8xb64-210e_coco-256x256
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.633
+ AP@0.5: 0.86
+ AP@0.75: 0.703
+ AR: 0.73
+ AR@0.5: 0.919
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md
new file mode 100644
index 0000000..cd5b26f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/README.md
@@ -0,0 +1,27 @@
+# RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation
+
+
+
+
+RTMO
+
+```bibtex
+@misc{lu2023rtmo,
+ title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
+ author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
+ year={2023},
+ eprint={2312.07526},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+RTMO is a one-stage pose estimation model that seamlessly integrates coordinate classification into the YOLO architecture. It introduces a Dynamic Coordinate Classifier (DCC) module that handles keypoint localization through dual 1D heatmaps. The DCC employs dynamic bin allocation, localizing the coordinate bins to each predicted bounding box to improve efficiency. It also uses learnable bin representations based on positional encodings, enabling computation of bin-keypoint similarity for precise localization.
+
+RTMO is trained end-to-end using a multi-task loss, with losses for bounding box regression, keypoint heatmap classification via a novel MLE loss, keypoint coordinate proxy regression, and keypoint visibility classification. The MLE loss models annotation uncertainty and balances optimization between easy and hard samples.
+
+During inference, RTMO employs grid-based dense predictions to simultaneously output human detection boxes and poses in a single pass. It selectively decodes heatmaps only for high-scoring grids after NMS, minimizing computational cost.
+
+Compared to prior one-stage methods that regress keypoint coordinates directly, RTMO achieves higher accuracy through coordinate classification while retaining real-time speeds. It also outperforms lightweight top-down approaches for images with many people, as the latter have inference times that scale linearly with the number of human instances.
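+
+The coordinate-classification idea described above can be made concrete with
+a small NumPy sketch. This is an illustration only, not the mmpose
+implementation: the bin counts follow the `num_bins=(192, 256)` setting used
+by the RTMO configs below, while the uniform per-box bin layout, the
+softmax-expectation decoding, and the function names are simplifying
+assumptions.
+
+```python
+# Illustrative decoding of keypoints from dual 1D heatmaps (assumed layout,
+# not the actual RTMO code): bins are localized to the predicted box and the
+# coordinate is the expectation under a per-axis categorical distribution.
+import numpy as np
+
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+
+def decode_keypoints(logits_x, logits_y, bbox):
+    """Decode (K, 2) keypoint coordinates from per-axis bin logits.
+
+    logits_x: (K, Bx) logits over horizontal bins
+    logits_y: (K, By) logits over vertical bins
+    bbox:     (x1, y1, x2, y2) predicted person box; the bins are spread
+              inside this box, mimicking dynamic bin allocation.
+    """
+    x1, y1, x2, y2 = bbox
+    bins_x = np.linspace(x1, x2, logits_x.shape[-1])
+    bins_y = np.linspace(y1, y2, logits_y.shape[-1])
+    xs = softmax(logits_x) @ bins_x  # expected x per keypoint
+    ys = softmax(logits_y) @ bins_y  # expected y per keypoint
+    return np.stack([xs, ys], axis=-1)
+
+
+# 17 COCO keypoints with 192 horizontal and 256 vertical bins, as in dcc_cfg.
+kpts = decode_keypoints(np.random.randn(17, 192), np.random.randn(17, 256),
+                        bbox=(50, 40, 210, 360))
+print(kpts.shape)  # (17, 2)
+```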
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py
new file mode 100644
index 0000000..e62ef17
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py
@@ -0,0 +1,533 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
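+# Schedule overview: quadratic warmup (epochs 0-5), cosine decay from 0.004
+# to 2e-4 (epochs 5-280), a one-epoch step back up to 5e-4 at epoch 280
+# (where RTMOModeSwitchHook re-weights the losses), a second cosine decay to
+# 2e-4 (epochs 281-580), then a constant lr for the last 20 epochs.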
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+# data settings
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# mapping
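+# Each (src, dst) pair remaps a source-dataset keypoint index to its COCO
+# index: e.g. in `aic_coco`, (0, 6) sends AIC joint 0 (right shoulder) to
+# COCO joint 6 (right shoulder). `ochuman_coco` is defined here but is not
+# referenced by `train_dataset` below.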
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
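+# Seven datasets ("body7") are converted to the 17-keypoint COCO skeleton
+# via KeypointConverter and combined; COCO keeps full sampling weight while
+# the auxiliary sets are downsampled through `sample_ratio_factor`.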
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[(i, i) for i in range(17)])
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+train_dataset = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file=metafile),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
+ test_mode=False,
+ pipeline=train_pipeline_stage1)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset)
+
+# val datasets
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
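+# Instances are scored with the predicted bbox score; `nms_mode='none'`
+# since the model already applies NMS at test time (see `test_cfg.nms_thr`).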
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
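+# Two-stage training: YOLOXPoseModeSwitchHook drops the heavy Mosaic/MixUp
+# augmentation and switches to `train_pipeline_stage2` on COCO only for the
+# final 20 epochs, while RTMOModeSwitchHook turns on classification-based
+# proxy targets and re-weights the losses at epoch 280.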
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_dataset=dataset_coco,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
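+# widen_factor / deepen_factor scale the channel widths and block depths of
+# the YOLOX-style CSPDarknet backbone; 1.0 / 1.0 is the large (L) variant.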
+widen_factor = 1.0
+deepen_factor = 1.0
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
+ '_20211126_140236-d3bd2b23.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[256, 512, 1024],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=512,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=512,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=512,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-2,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py
new file mode 100644
index 0000000..ffddaab
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py
@@ -0,0 +1,532 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+# data settings
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[(i, i) for i in range(17)])
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+train_dataset = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file=metafile),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
+ test_mode=False,
+ pipeline=train_pipeline_stage1)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset)
+
+# val datasets
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_dataset=dataset_coco,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
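+# 0.75 / 0.67 select the medium (M) YOLOX variant; the projector output,
+# pose feature vector and DCC input widths shrink to 384 channels to match.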
+widen_factor = 0.75
+deepen_factor = 0.67
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[192, 384, 768],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=384,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=384,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=384,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-2,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py
new file mode 100644
index 0000000..7d132fd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py
@@ -0,0 +1,535 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+# data settings
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[(i, i) for i in range(17)])
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+train_dataset = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file=metafile),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
+ test_mode=False,
+ pipeline=train_pipeline_stage1)
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset)
+
+# val datasets
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_dataset=dataset_coco,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
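+# 0.5 / 0.33 select the small (S) YOLOX variant; feature widths shrink to
+# 256 channels and the assigner additionally uses keypoints when locating
+# instance centers (`use_keypoints_for_center=True`).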
+widen_factor = 0.5
+deepen_factor = 0.33
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
+ '20211121_095711-4592a793.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[128, 256, 512],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=256,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=256,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile),
+ use_keypoints_for_center=True),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=256,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1.0,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py
new file mode 100644
index 0000000..ec719a7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py
@@ -0,0 +1,529 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (416, 416)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(416, 416),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(416, 416),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(416, 416),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+# data settings
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[(i, i) for i in range(17)])
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+train_dataset = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file=metafile),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
+ test_mode=False,
+ pipeline=train_pipeline_stage1)
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset)
+
+# val datasets
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_dataset=dataset_coco,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
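+# 0.375 / 0.33 select the tiny (T) variant, trained at 416x416 input; the
+# projector/DCC widths drop to 192 channels and, unlike the larger models,
+# the stage-1 pipeline above omits YOLOXMixUp.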
+widen_factor = 0.375
+deepen_factor = 0.33
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(320, 640),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_'
+ '20211124_171234-b4047906.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[96, 192, 384],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=192,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=192,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile),
+ use_keypoints_for_center=True),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=192,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1.0,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md
new file mode 100644
index 0000000..59bbfee
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.md
@@ -0,0 +1,132 @@
+
+
+
+RTMO
+
+```bibtex
+@misc{lu2023rtmo,
+ title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
+ author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
+ year={2023},
+ eprint={2312.07526},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+AI Challenger (ArXiv'2017)
+
+```bibtex
+@article{wu2017ai,
+ title={Ai challenger: A large-scale dataset for going deeper in image understanding},
+ author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
+ journal={arXiv preprint arXiv:1711.06475},
+ year={2017}
+}
+```
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+
+
+
+JHMDB (ICCV'2013)
+
+```bibtex
+@inproceedings{Jhuang:ICCV:2013,
+ title = {Towards understanding action recognition},
+ author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
+ booktitle = {International Conf. on Computer Vision (ICCV)},
+ month = Dec,
+ pages = {3192-3199},
+ year = {2013}
+}
+```
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+
+PoseTrack18 (CVPR'2018)
+
+```bibtex
+@inproceedings{andriluka2018posetrack,
+ title={Posetrack: A benchmark for human pose estimation and tracking},
+ author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={5167--5176},
+ year={2018}
+}
+```
+
+
+
+
+Halpe (CVPR'2020)
+
+```bibtex
+@inproceedings{li2020pastanet,
+ title={PaStaNet: Toward Human Activity Knowledge Engine},
+ author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
+ booktitle={CVPR},
+ year={2020}
+}
+```
+
+
+
+Results on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | onnx |
+| :--------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------: | :-------------------------------: | :--------------------------------: |
+| [RTMO-t](/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py) | 416x416 | 0.574 | 0.803 | 0.613 | 0.611 | 0.836 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416_20231219.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.zip) |
+| [RTMO-s](/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py) | 640x640 | 0.686 | 0.879 | 0.744 | 0.723 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.zip) |
+| [RTMO-m](/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py) | 640x640 | 0.726 | 0.899 | 0.790 | 0.763 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.zip) |
+| [RTMO-l](/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py) | 640x640 | 0.748 | 0.911 | 0.813 | 0.786 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.zip) |
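+
+The `rtmo` alias registered for the large model in the accompanying metafile
+makes these checkpoints usable through `MMPoseInferencer`. A minimal sketch,
+assuming mmpose is installed and `demo.jpg` stands in for a real image; the
+prediction keys shown are indicative of the inferencer's output format:
+
+```python
+from mmpose.apis import MMPoseInferencer
+
+# 'rtmo' resolves to rtmo-l_16xb16-600e_body7-640x640 via the metafile alias;
+# weights are downloaded automatically on first use.
+inferencer = MMPoseInferencer(pose2d='rtmo')
+result = next(inferencer('demo.jpg'))      # the call returns a generator
+for instance in result['predictions'][0]:  # instances in the first image
+    print(len(instance['keypoints']))      # 17 COCO keypoints per person
+```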
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml
new file mode 100644
index 0000000..046db3b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/body7/rtmo_body7.yml
@@ -0,0 +1,74 @@
+Models:
+- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: &id001
+ - RTMO
+ Training Data: &id002
+ - AI Challenger
+ - COCO
+ - CrowdPose
+ - MPII
+ - sub-JHMDB
+ - Halpe
+ - PoseTrack18
+ Name: rtmo-t_8xb32-600e_body7-416x416
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.574
+ AP@0.5: 0.803
+ AP@0.75: 0.613
+ AR: 0.611
+ AR@0.5: 0.836
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth
+- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmo-s_8xb32-600e_body7-640x640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.686
+ AP@0.5: 0.879
+ AP@0.75: 0.744
+ AR: 0.723
+ AR@0.5: 0.908
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmo-m_16xb16-600e_body7-640x640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.726
+ AP@0.5: 0.899
+ AP@0.75: 0.790
+ AR: 0.763
+ AR@0.5: 0.926
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py
+ In Collection: RTMO
+ Alias: rtmo
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmo-l_16xb16-600e_body7-640x640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.911
+ AP@0.75: 0.813
+ AR: 0.786
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py
new file mode 100644
index 0000000..c310068
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py
@@ -0,0 +1,321 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_coco)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 1.0
+deepen_factor = 1.0
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
+ '_20211126_140236-d3bd2b23.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[256, 512, 1024],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=512,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=512,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=512,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-2,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
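The `param_scheduler` list above chains five schedulers: quadratic warmup for 5 epochs, cosine decay to `eta_min=2e-4` by epoch 280, a one-epoch `ConstantLR` bump (factor 2.5 turns 2e-4 back into 5e-4, per the inline comment), a second cosine decay until epoch 580, and a constant tail to 600. A self-contained sketch of the resulting curve at epoch granularity (an approximation: with `convert_to_iter_based=True`, mmengine actually steps these per iteration):

```python
import math

base_lr, eta_min = 0.004, 0.0002

def lr_at(epoch: int) -> float:
    """Epoch-level approximation of the chained RTMO schedule."""
    if epoch < 5:        # QuadraticWarmupLR over epochs [0, 5)
        return base_lr * (epoch / 5) ** 2
    if epoch < 280:      # first CosineAnnealingLR, annealed over [5, 280)
        t = (epoch - 5) / (280 - 5)
        return eta_min + (base_lr - eta_min) * 0.5 * (1 + math.cos(math.pi * t))
    if epoch < 281:      # ConstantLR bump: 2e-4 * 2.5 = 5e-4
        return eta_min * 2.5
    if epoch < 580:      # second CosineAnnealingLR back down to eta_min
        start = eta_min * 2.5
        t = (epoch - 281) / (580 - 281)
        return eta_min + (start - eta_min) * 0.5 * (1 + math.cos(math.pi * t))
    return eta_min       # constant tail until max_epochs=600

for e in (0, 4, 5, 150, 279, 280, 281, 430, 579, 599):
    print(f'epoch {e:3d}: lr = {lr_at(e):.6f}')
```

The bump at epoch 280 coincides with `RTMOModeSwitchHook` enabling the proxy-target branch, which presumably gives the re-weighted losses room to settle.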
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py
new file mode 100644
index 0000000..fa158e2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py
@@ -0,0 +1,320 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_coco)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 0.75
+deepen_factor = 0.67
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[192, 384, 768],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=384,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=384,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=384,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-2,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
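Relative to the `-l` config, this `-m` variant changes only `widen_factor`/`deepen_factor` and the channel counts that follow from them. A rough illustration of how these factors scale a YOLOX-style CSPDarknet (the base widths and block counts below are assumptions; the authoritative logic lives in mmdet's `CSPDarknet`):

```python
# Illustrative only: what widen/deepen factors do to a YOLOX-style backbone.
base_channels = [64, 128, 256, 512, 1024]  # assumed stage widths at factor 1.0
base_num_blocks = [3, 9, 9, 3]             # assumed CSP block counts at 1.0

def scale(widen: float, deepen: float):
    widths = [int(c * widen) for c in base_channels]
    depths = [max(round(n * deepen), 1) for n in base_num_blocks]
    return widths, depths

for name, w, d in [('rtmo-l', 1.0, 1.0), ('rtmo-m', 0.75, 0.67),
                   ('rtmo-s', 0.5, 0.33)]:
    widths, depths = scale(w, d)
    print(f'{name}: widths={widths} depths={depths}')
```

The last three widths printed for each size line up with the `neck.in_channels` in these configs ([256, 512, 1024], [192, 384, 768], [128, 256, 512]).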
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py
new file mode 100644
index 0000000..283cf68
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py
@@ -0,0 +1,323 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=281,
+ T_max=300,
+ end=580,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/coco.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_coco)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 280: {
+ 'proxy_target_cc': True,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 0.5
+deepen_factor = 0.33
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
+ '20211121_095711-4592a793.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[128, 256, 512],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=256,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=17,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=256,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile),
+ use_keypoints_for_center=True),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=256,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+            loss_weight=1e-2,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
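All three COCO configs inherit runtime defaults through `_base_`. A sketch of loading and tweaking one programmatically with mmengine's `Config` (a real API; the dotted-key override below assumes the vendored `_base_` files are present so the merge succeeds):

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/'
    'rtmo-s_8xb32-600e_coco-640x640.py')

# _base_ files are merged on load, so inherited keys are already visible.
print(cfg.train_cfg.max_epochs)          # 600
print(cfg.model.backbone.widen_factor)   # 0.5

# Override values without editing the file, e.g. for a quick smoke test.
cfg.merge_from_dict({
    'train_cfg.max_epochs': 10,
    'train_dataloader.batch_size': 4,
})
cfg.dump('rtmo-s_smoke_test.py')  # materialize the merged config
```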
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md
new file mode 100644
index 0000000..6f8d410
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.md
@@ -0,0 +1,43 @@
+
+
+
+RTMO
+
+```bibtex
+@misc{lu2023rtmo,
+ title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
+ author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
+ year={2023},
+ eprint={2312.07526},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [RTMO-s](/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py) | 640x640 | 0.677 | 0.878 | 0.737 | 0.715 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640_20231211.json) |
+| [RTMO-m](/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py) | 640x640 | 0.709 | 0.890 | 0.778 | 0.747 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640_20231211.json) |
+| [RTMO-l](/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py) | 640x640 | 0.724 | 0.899 | 0.788 | 0.762 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640_20231211.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml
new file mode 100644
index 0000000..6b48504
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml
@@ -0,0 +1,56 @@
+Collections:
+- Name: RTMO
+ Paper:
+ Title: 'RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation'
+ URL: https://arxiv.org/abs/2312.07526
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/rtmo.md
+Models:
+- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: &id001
+ - RTMO
+    Training Data: COCO
+  Name: rtmo-s_8xb32-600e_coco-640x640
+  Results:
+  - Dataset: COCO
+    Metrics:
+      AP: 0.677
+ AP@0.5: 0.878
+ AP@0.75: 0.737
+ AR: 0.715
+ AR@0.5: 0.908
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+    Training Data: COCO
+  Name: rtmo-m_16xb16-600e_coco-640x640
+  Results:
+  - Dataset: COCO
+ Metrics:
+ AP: 0.709
+ AP@0.5: 0.890
+ AP@0.75: 0.778
+ AR: 0.747
+ AR@0.5: 0.920
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+    Training Data: COCO
+  Name: rtmo-l_16xb16-600e_coco-640x640
+  Results:
+  - Dataset: COCO
+ Metrics:
+ AP: 0.724
+ AP@0.5: 0.899
+ AP@0.75: 0.788
+ AR: 0.762
+ AR@0.5: 0.927
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth
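This metafile is plain YAML, so it is easy to consume programmatically; the `&id001`/`*id001` anchors are resolved by any standard loader. A small sketch that picks the strongest checkpoint by AP (assuming PyYAML and the vendored file path):

```python
import yaml

path = 'modules/rtmpose/configs/body_2d_keypoint/rtmo/coco/rtmo_coco.yml'
with open(path) as f:
    zoo = yaml.safe_load(f)

# Anchors are expanded on load, so every model carries the full metadata.
best = max(zoo['Models'], key=lambda m: m['Results'][0]['Metrics']['AP'])
print(best['Name'])                         # rtmo-l_16xb16-600e_coco-640x640
print(best['Results'][0]['Metrics']['AP'])  # 0.724
print(best['Weights'])                      # checkpoint URL to download
```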
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py
new file mode 100644
index 0000000..f1cc089
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py
@@ -0,0 +1,502 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=350,
+ end=349,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=350,
+ T_max=320,
+ end=670,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/crowdpose.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+# data settings
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# mapping
+aic_crowdpose = [(3, 0), (0, 1), (4, 2), (1, 3), (5, 4), (2, 5),
+ (9, 6), (6, 7), (10, 8), (7, 9), (11, 10), (8, 11), (12, 12),
+ (13, 13)]
+
+coco_crowdpose = [
+ (5, 0),
+ (6, 1),
+ (7, 2),
+ (8, 3),
+ (9, 4),
+ (10, 5),
+ (11, 6),
+ (12, 7),
+ (13, 8),
+ (14, 9),
+ (15, 10),
+ (16, 11),
+]
+
+mpii_crowdpose = [
+ (13, 0),
+ (12, 1),
+ (14, 2),
+ (11, 3),
+ (15, 4),
+ (10, 5),
+ (3, 6),
+ (2, 7),
+ (4, 8),
+ (1, 9),
+ (5, 10),
+ (0, 11),
+ (9, 12),
+ (7, 13),
+]
+
+jhmdb_crowdpose = [(4, 0), (3, 1), (8, 2), (7, 3), (12, 4), (11, 5), (6, 6),
+ (5, 7), (10, 8), (9, 9), (14, 10), (13, 11), (2, 12),
+ (0, 13)]
+
+halpe_crowdpose = [
+ (5, 0),
+ (6, 1),
+ (7, 2),
+ (8, 3),
+ (9, 4),
+ (10, 5),
+ (11, 6),
+ (12, 7),
+ (13, 8),
+ (14, 9),
+ (15, 10),
+ (16, 11),
+]
+
+posetrack_crowdpose = [
+ (5, 0),
+ (6, 1),
+ (7, 2),
+ (8, 3),
+ (9, 4),
+ (10, 5),
+ (11, 6),
+ (12, 7),
+ (13, 8),
+ (14, 9),
+ (15, 10),
+ (16, 11),
+ (2, 12),
+ (1, 13),
+]
+
+# train datasets
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=14, mapping=coco_crowdpose)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=14, mapping=aic_crowdpose)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=14,
+ mapping=[(i, i) for i in range(14)])
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=14, mapping=mpii_crowdpose)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=14,
+ mapping=jhmdb_crowdpose)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=14,
+ mapping=halpe_crowdpose)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=14,
+ mapping=posetrack_crowdpose)
+ ],
+)
+
+train_dataset_stage1 = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file=metafile),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ sample_ratio_factor=[1, 0.3, 1, 0.3, 0.3, 0.4, 0.3],
+ test_mode=False,
+ pipeline=train_pipeline_stage1)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset_stage1)
+
+# val datasets
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ score_mode='bbox',
+ nms_mode='none',
+ iou_type='keypoints_crowd',
+ prefix='crowdpose',
+ use_area=False,
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=30,
+ new_train_dataset=dataset_crowdpose,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 350: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 1.0
+deepen_factor = 1.0
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
+ '_20211126_140236-d3bd2b23.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[256, 512, 1024],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=512,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=14,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=512,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=512,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-3,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
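The `*_crowdpose` lists in this config are `(source_index, target_index)` pairs: `KeypointConverter` uses them to reorder each source dataset's keypoints into the 14-point CrowdPose layout before the datasets are combined. A small numpy sketch of the remapping idea (illustrative only, not mmpose's actual implementation):

```python
import numpy as np

coco_crowdpose = [(5, 0), (6, 1), (7, 2), (8, 3), (9, 4), (10, 5),
                  (11, 6), (12, 7), (13, 8), (14, 9), (15, 10), (16, 11)]

def convert(kpts: np.ndarray, mapping, num_keypoints: int = 14) -> np.ndarray:
    """Reorder (N, K_src, 2) keypoints into the target layout.

    Target slots with no source stay zeroed; here that is CrowdPose's
    head-top and neck (indices 12-13), which COCO does not annotate.
    """
    out = np.zeros((kpts.shape[0], num_keypoints, kpts.shape[-1]),
                   dtype=kpts.dtype)
    src, dst = zip(*mapping)
    out[:, list(dst)] = kpts[:, list(src)]
    return out

coco_kpts = np.random.rand(2, 17, 2)             # two people, 17 COCO keypoints
print(convert(coco_kpts, coco_crowdpose).shape)  # (2, 14, 2)
```

This is also why `sample_ratio_factor` has seven entries: one per source dataset in `CombinedDataset`, down-weighting the auxiliary sets relative to COCO and CrowdPose.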
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py
new file mode 100644
index 0000000..d944cca
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py
@@ -0,0 +1,326 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=350,
+ end=349,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=350,
+ T_max=320,
+ end=670,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/crowdpose.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.2,
+ rotate_factor=30,
+ scale_factor=(0.5, 1.5),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.6, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_crowdpose)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ score_mode='bbox',
+ nms_mode='none',
+ iou_type='keypoints_crowd',
+ prefix='crowdpose',
+ use_area=False,
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=30,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 350: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 1.0
+deepen_factor = 1.0
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
+ '_20211126_140236-d3bd2b23.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[256, 512, 1024],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=512,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=14,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=512,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=512,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-3,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
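`RTMOModeSwitchHook` updates model attributes in place at epoch 350, and keys such as `'loss_cls.loss_weight'` address nested sub-modules by dotted path. A rough sketch of that dotted-path update on stand-in objects (an assumption about the mechanism, not mmpose's actual hook code):

```python
from functools import reduce

def set_by_path(obj, path: str, value):
    """Set a nested attribute addressed by a dotted path."""
    *parents, leaf = path.split('.')
    target = reduce(getattr, parents, obj)  # walk e.g. obj.loss_cls
    setattr(target, leaf, value)

class Loss:  # stand-in for a loss module with a weight attribute
    def __init__(self, loss_weight):
        self.loss_weight = loss_weight

class Head:  # stand-in for the RTMO head
    def __init__(self):
        self.proxy_target_cc = False
        self.overlaps_power = 0.5
        self.loss_cls = Loss(1.0)
        self.loss_oks = Loss(30.0)

head = Head()
epoch_attributes = {'proxy_target_cc': True, 'overlaps_power': 1.0,
                    'loss_cls.loss_weight': 2.0, 'loss_oks.loss_weight': 10.0}
for key, value in epoch_attributes.items():
    set_by_path(head, key, value)
print(head.overlaps_power, head.loss_cls.loss_weight)  # 1.0 2.0
```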
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py
new file mode 100644
index 0000000..0b925f4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py
@@ -0,0 +1,325 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=350,
+ end=349,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=350,
+ T_max=320,
+ end=670,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/crowdpose.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.2,
+ rotate_factor=30,
+ scale_factor=(0.5, 1.5),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.6, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_crowdpose)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ score_mode='bbox',
+ nms_mode='none',
+ iou_type='keypoints_crowd',
+ prefix='crowdpose',
+ use_area=False,
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=30,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
+ epoch_attributes={
+ 350: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 0.75
+deepen_factor = 0.67
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[192, 384, 768],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=384,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=14,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=384,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=384,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-3,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
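Every RTMO config here sets `auto_scale_lr = dict(base_batch_size=256)`: when enabled (mmengine's `--auto-scale-lr` flag), the optimizer LR is rescaled linearly by the ratio of the effective batch size to 256. The file names encode the reference setup as GPUs x per-GPU batch (`16xb16`, `8xb32`), both of which multiply out to exactly 256. A back-of-the-envelope sketch of the rule:

```python
def scaled_lr(base_lr: float, base_batch: int,
              num_gpus: int, batch_per_gpu: int) -> float:
    """Linear scaling rule: lr scales with effective batch / base batch."""
    return base_lr * (num_gpus * batch_per_gpu) / base_batch

# Published setups hit the base batch exactly, so lr stays at 0.004.
print(scaled_lr(0.004, 256, 16, 16))  # 0.004
print(scaled_lr(0.004, 256, 8, 32))   # 0.004

# A hypothetical single-GPU run with batch 16 would be scaled down.
print(scaled_lr(0.004, 256, 1, 16))   # 0.00025
```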
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py
new file mode 100644
index 0000000..4121f55
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py
@@ -0,0 +1,326 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ constructor='ForceDefaultOptimWrapperConstructor',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ force_default_settings=True,
+ custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=350,
+ end=349,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ # this scheduler is used to increase the lr from 2e-4 to 5e-4
+ dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=350,
+ T_max=320,
+ end=670,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
+]
+
+# data
+input_size = (640, 640)
+metafile = 'configs/_base_/datasets/crowdpose.py'
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.2,
+ rotate_factor=30,
+ scale_factor=(0.5, 1.5),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.6, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='BottomupGetHeatmapMask', get_invalid=True),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+# train datasets
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_crowdpose)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ score_mode='bbox',
+ nms_mode='none',
+ iou_type='keypoints_crowd',
+ prefix='crowdpose',
+ use_area=False,
+)
+test_evaluator = val_evaluator
+
+# hooks
+custom_hooks = [
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=30,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(
+ type='RTMOModeSwitchHook',
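+        # At epoch 350, switch on 'proxy_target_cc' and re-weight the
+        # cls/mle/oks losses for the second training stage.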
+ epoch_attributes={
+ 350: {
+ 'proxy_target_cc': True,
+ 'overlaps_power': 1.0,
+ 'loss_cls.loss_weight': 2.0,
+ 'loss_mle.loss_weight': 5.0,
+ 'loss_oks.loss_weight': 10.0
+ },
+ },
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
+
+# model
+widen_factor = 0.5
+deepen_factor = 0.33
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
+ '20211121_095711-4592a793.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='HybridEncoder',
+ in_channels=[128, 256, 512],
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ hidden_dim=256,
+ output_indices=[1, 2],
+ encoder_cfg=dict(
+ self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+ ffn_cfg=dict(
+ embed_dims=256,
+ feedforward_channels=1024,
+ ffn_drop=0.0,
+ act_cfg=dict(type='GELU'))),
+ projector=dict(
+ type='ChannelMapper',
+ in_channels=[256, 256],
+ kernel_size=1,
+ out_channels=256,
+ act_cfg=None,
+ norm_cfg=dict(type='BN'),
+ num_outs=2)),
+ head=dict(
+ type='RTMOHead',
+ num_keypoints=14,
+ featmap_strides=(16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ cls_feat_channels=256,
+ channels_per_group=36,
+ pose_vec_channels=256,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ assigner=dict(
+ type='SimOTAAssigner',
+ dynamic_k_indicator='oks',
+ oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
+ prior_generator=dict(
+ type='MlvlPointGenerator',
+ centralize_points=True,
+ strides=[16, 32]),
+ dcc_cfg=dict(
+ in_channels=256,
+ feat_channels=128,
+ num_bins=(192, 256),
+ spe_channels=128,
+ gau_cfg=dict(
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ pos_enc='add')),
+ overlaps_power=0.5,
+ loss_cls=dict(
+ type='VariFocalLoss',
+ reduction='sum',
+ use_target_weight=True,
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo=metafile,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_mle=dict(
+ type='MLECCLoss',
+ use_target_weight=True,
+ loss_weight=1e-3,
+ ),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ input_size=input_size,
+ score_thr=0.1,
+ nms_thr=0.65,
+ ))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md
new file mode 100644
index 0000000..8c95c5e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.md
@@ -0,0 +1,44 @@
+## RTMO (arXiv'2023)
+
+```bibtex
+@misc{lu2023rtmo,
+ title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
+ author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
+ year={2023},
+ eprint={2312.07526},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+## CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+## Results on CrowdPose test
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [RTMO-s](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py) | 640x640 | 0.673 | 0.882 | 0.729 | 0.737 | 0.682 | 0.591 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640_20231211.json) |
+| [RTMO-m](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.711 | 0.897 | 0.771 | 0.774 | 0.719 | 0.634 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-700e_crowdpose-640x640_20231211.json) |
+| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.732 | 0.907 | 0.793 | 0.792 | 0.741 | 0.653 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640_20231211.json) |
+| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py)\* | 640x640 | 0.838 | 0.947 | 0.893 | 0.888 | 0.847 | 0.772 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640_20231219.json) |
+
+\* indicates the model is trained using a combined dataset composed of AI Challenger, COCO, CrowdPose, Halpe, MPII, PoseTrack18 and sub-JHMDB.
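+
+Since RTMO is a one-stage, bottom-up estimator, no external person detector is needed at inference time. As a minimal sketch, assuming the mmpose 1.x Python API (config and checkpoint taken from the RTMO-s row above):
+
+```python
+from mmpose.apis import inference_bottomup, init_model
+
+config = ('configs/body_2d_keypoint/rtmo/crowdpose/'
+          'rtmo-s_8xb32-700e_crowdpose-640x640.py')
+checkpoint = ('https://download.openmmlab.com/mmpose/v1/projects/rtmo/'
+              'rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth')
+
+model = init_model(config, checkpoint, device='cpu')
+
+# inference_bottomup returns one PoseDataSample per input image.
+result = inference_bottomup(model, 'demo.jpg')[0]
+print(result.pred_instances.keypoints.shape)  # (num_persons, 14, 2) on CrowdPose
+```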
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml
new file mode 100644
index 0000000..8c7804b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmo/crowdpose/rtmo_crowdpose.yml
@@ -0,0 +1,70 @@
+Models:
+- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: &id001
+ - RTMO
+ Training Data: CrowdPose
+ Name: rtmo-s_8xb32-700e_crowdpose-640x640
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.673
+ AP@0.5: 0.882
+ AP@0.75: 0.729
+ AP (E): 0.737
+ AP (M): 0.682
+      AP (H): 0.591
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: rtmo-m_16xb16-700e_crowdpose-640x640
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.711
+ AP@0.5: 0.897
+ AP@0.75: 0.771
+ AP (E): 0.774
+ AP (M): 0.719
+      AP (H): 0.634
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: rtmo-l_16xb16-700e_crowdpose-640x640
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.732
+ AP@0.5: 0.907
+ AP@0.75: 0.793
+ AP (E): 0.792
+ AP (M): 0.741
+      AP (H): 0.653
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth
+- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py
+ In Collection: RTMO
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: rtmo-l_16xb16-700e_body7-crowdpose-640x640
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.838
+ AP@0.5: 0.947
+ AP@0.75: 0.893
+ AP (E): 0.888
+ AP (M): 0.847
+      AP (H): 0.772
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md
new file mode 100644
index 0000000..19e4c68
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/README.md
@@ -0,0 +1,57 @@
+# RTMPose
+
+Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet their industrial application still suffers from heavy model parameters and high latency.
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
+Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
+To further evaluate RTMPose's capability in critical real-time applications, we also report its performance after deployment on a mobile device.
+
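+The models in this directory are top-down pose estimators, so inference takes an image together with person bounding boxes from a detector. As a minimal sketch, assuming the mmpose 1.x Python API and placeholder config/checkpoint paths (substitute any pair from the tables below):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+from mmpose.structures import merge_data_samples
+
+# Placeholder paths: substitute any config/checkpoint pair from the tables below.
+config = 'configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py'
+checkpoint = 'rtmpose-m_coco-256x192.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+
+# Without explicit bboxes, the whole image is treated as a single person box;
+# in practice, pass detector outputs via the `bboxes` argument.
+results = inference_topdown(model, 'demo.jpg')
+pred = merge_data_samples(results)
+print(pred.pred_instances.keypoints.shape)  # (num_instances, 17, 2)
+```
+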
+## Results and Models
+
+### COCO Dataset
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Model | Input Size | AP | AR | Details and Download |
+| :----------------: | :--------: | :---: | :---: | :---------------------------------------: |
+| RTMPose-t | 256x192 | 0.682 | 0.736 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-s | 256x192 | 0.716 | 0.768 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-m | 256x192 | 0.746 | 0.795 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-l | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-t-aic-coco | 256x192 | 0.685 | 0.738 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-s-aic-coco | 256x192 | 0.722 | 0.772 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-m-aic-coco | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-l-aic-coco | 256x192 | 0.765 | 0.813 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-m-aic-coco | 384x288 | 0.770 | 0.816 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+| RTMPose-l-aic-coco | 384x288 | 0.773 | 0.819 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
+
+### MPII Dataset
+
+| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download |
+| :-------: | :--------: | :------: | :------: | :---------------------------------------: |
+| RTMPose-m | 256x256 | 0.907 | 0.348 | [rtmpose_mpii.md](./mpii/rtmpose_mpii.md) |
+
+### CrowdPose Dataset
+
+Results on CrowdPose test with the [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------: | :--------: | :---: | :---: | :------------------------------------------------------: |
+| RTMPose-m | 256x192 | 0.706 | 0.788 | [rtmpose_crowdpose.md](./crowdpose/rtmpose_crowdpose.md) |
+
+### Human-Art Dataset
+
+Results on the Human-Art validation set with a detector having a human AP of 56.2 on Human-Art validation
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
+| RTMPose-s | 256x192 | 0.311 | 0.381 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
+| RTMPose-m | 256x192 | 0.355 | 0.417 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
+| RTMPose-l | 256x192 | 0.378 | 0.442 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
+
+Results on the Human-Art validation set with ground-truth bounding boxes
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
+| RTMPose-s | 256x192 | 0.698 | 0.732 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
+| RTMPose-m | 256x192 | 0.728 | 0.759 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
+| RTMPose-l | 256x192 | 0.753 | 0.783 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py
new file mode 100644
index 0000000..ea72ca7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py
@@ -0,0 +1,553 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
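+# SimCC codec: input_size is (width, height); with simcc_split_ratio=2.0 each
+# axis is discretized into two classification bins per pixel.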
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-body7_210e-256x192-5e9558ef_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
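+# Keypoint index mapping: each pair is (source dataset index, COCO index),
+# consumed by the KeypointConverter transforms in the dataset configs below.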
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py
new file mode 100644
index 0000000..6ffcd6e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py
@@ -0,0 +1,553 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-body7_210e-384x288-b15bc30d_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
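+# (source dataset keypoint index, COCO keypoint index) pairs consumed by the
+# KeypointConverter transforms below.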
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py
new file mode 100644
index 0000000..2a069ba
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py
@@ -0,0 +1,535 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (192, 256)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 512
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
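+# SimCC codec: input_size is (width, height); simcc_split_ratio=2.0 gives two
+# classification bins per pixel along each axis.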
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
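+# Keypoint index mapping onto the 26-keypoint Halpe skeleton: each pair is
+# (source dataset index, Halpe-26 index), used by KeypointConverter below.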
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=5,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=5,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py
new file mode 100644
index 0000000..ae75a5e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py
@@ -0,0 +1,535 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (288, 384)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 512
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
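+# (source dataset keypoint index, Halpe-26 keypoint index) pairs consumed by
+# the KeypointConverter transforms below.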
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
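+# EMAHook keeps an exponential moving average of the weights during
+# training; PipelineSwitchHook swaps in train_pipeline_stage2 for the
+# final stage2_num_epochs epochs.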
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
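+
+# A typical launch, assuming a standard mmpose checkout:
+#   python tools/train.py <path/to/this/config>.py --amp --auto-scale-lr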
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py
new file mode 100644
index 0000000..c96ba39
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py
@@ -0,0 +1,553 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
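+# With auto-scale-LR enabled at launch, the LR above is multiplied by
+# actual_total_batch_size / 1024.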
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
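+# SimCC casts keypoint localization as per-axis classification: with
+# simcc_split_ratio=2.0, the 192x256 input is discretized into 384 x-bins
+# and 512 y-bins; sigma sets the Gaussian label-smoothing width.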
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-body7_210e-256x192-e0c9327b_20230504.pth' # noqa
+ )),
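+    # deepen_factor=0.67 / widen_factor=0.75 select the CSPNeXt-m scale;
+    # the head's in_channels=768 below matches the resulting feature width.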
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
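+# The stage-2 recipe is milder: PhotometricDistortion is dropped, bbox
+# shift is disabled and the CoarseDropout probability is halved to 0.5.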
+
+# mapping
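+# (source_index, coco_index) pairs for KeypointConverter; e.g. aic_coco's
+# (0, 6) maps AIC's right shoulder (index 0) onto COCO's right shoulder
+# (index 6). COCO joints without a source entry stay unannotated.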
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
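+# Standard top-down COCO evaluation: person boxes come from the AP_H_56
+# detector results file above rather than from ground-truth annotations.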
+
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
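+# Validation reports COCO OKS AP on val2017; testing reports PCK@0.1, AUC
+# and EPE over the combined eight-dataset loader.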
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py
new file mode 100644
index 0000000..4118d7d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py
@@ -0,0 +1,553 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
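+# Same recipe as the 256x192 variant; sigma grows with the 288x384 input
+# so the label smoothing covers a similar fraction of each axis.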
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-body7_210e-384x288-b9bc2b57_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.0,
+ drop_path=0.0,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py
new file mode 100644
index 0000000..cad89e4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py
@@ -0,0 +1,529 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (192, 256)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 512
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
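+# Unlike the 420-epoch body8 configs above, the 700-epoch halpe26 recipes
+# also clip gradients to a maximum L2 norm of 35.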
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py
new file mode 100644
index 0000000..5c3aff2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py
@@ -0,0 +1,542 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (288, 384)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 512
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+# backend_args = dict(backend='local')
+backend_args = dict(
+ backend='petrel',
+ path_mapping=dict({
+        f'{data_root}': 's3://openmmlab/datasets/'
+ }))
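+# The petrel backend rewrites paths under data_root to the S3 bucket;
+# re-enable the commented local backend when data is on the filesystem.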
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py
new file mode 100644
index 0000000..7890c58
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py
@@ -0,0 +1,535 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (192, 256)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 1024
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
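+# Note weight_decay=0.0 here, versus 0.05 in the -m and -l configs.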
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
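+# Warmup runs linearly over the first 1000 iterations; the LR then holds at
+# base_lr until epoch 350 and cosine-decays to 5% of base_lr by epoch 700.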
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
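+# When training is launched with mmengine's --auto-scale-lr flag, the LR is
+# rescaled linearly by (actual total batch size / base_batch_size); it is a
+# no-op otherwise.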
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
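+# SimCC encodes each keypoint as two 1-D classification targets, one per
+# axis, at simcc_split_ratio x the input resolution (192*2=384 x-bins and
+# 256*2=512 y-bins here); sigma sets the width of the Gaussian label
+# smoothing.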
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
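+# CSPNeXt-s backbone (deepen 0.33 / widen 0.5, 512-channel final stage)
+# feeding the SimCC head; in_featuremap_size works out to (6, 8) for the
+# 192x256 input. flip_test averages predictions over the image and its
+# horizontal flip.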
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.6, 1.4],
+ rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
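+# The stage-2 pipeline mirrors train_pipeline with milder augmentation (no
+# PhotometricDistortion, CoarseDropout at p=0.5, zero bbox shift); the
+# PipelineSwitchHook at the bottom of this file enables it for the final
+# stage2_num_epochs epochs.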
+
+# mapping
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
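+# CombinedDataset concatenates the seven training sources after each
+# per-dataset converter pipeline has remapped its annotations to the shared
+# Halpe-26 layout; the common train_pipeline then runs on the merged samples.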
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py
new file mode 100644
index 0000000..a229d05
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py
@@ -0,0 +1,553 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use a cosine LR schedule from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-s_udp-body7_210e-256x192-8c9ccbdb_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
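+# COCO already uses the target 17-keypoint layout, so no KeypointConverter
+# is needed and the per-dataset pipeline stays empty.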
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
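+# Validation above follows the standard top-down COCO protocol, using the
+# precomputed person detections (the AP_H_56 file) instead of ground-truth
+# boxes; the test loader below covers all eight datasets with GT boxes, so
+# the two sets of numbers are not directly comparable.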
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
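+# CocoMetric reports OKS-based AP on COCO val2017; the combined test split is
+# scored with dataset-agnostic keypoint metrics instead (PCK@0.1, AUC, and
+# mean end-point error).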
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py
new file mode 100644
index 0000000..3404d52
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py
@@ -0,0 +1,536 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (192, 256)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 1024
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa
+ )),
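+    # The -t variant: deepen 0.167 / widen 0.375 shrink CSPNeXt so the final
+    # stage emits 384 channels, matching the head's in_channels below.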
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.6, 1.4],
+ rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
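+# Unlike the -s/-x configs, EMA is left disabled here; only the switch to the
+# milder stage-2 augmentation pipeline remains active.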
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py
new file mode 100644
index 0000000..966c545
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py
@@ -0,0 +1,554 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 20
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use a cosine LR schedule from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+aic_coco = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+]
+
+crowdpose_coco = [
+ (0, 5),
+ (1, 6),
+ (2, 7),
+ (3, 8),
+ (4, 9),
+ (5, 10),
+ (6, 11),
+ (7, 12),
+ (8, 13),
+ (9, 14),
+ (10, 15),
+ (11, 16),
+]
+
+mpii_coco = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+ochuman_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+posetrack_coco = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+test_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+# default_hooks = dict(
+# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+
+test_evaluator = [
+ dict(type='PCKAccuracy', thr=0.1),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py
new file mode 100644
index 0000000..f0c68e7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py
@@ -0,0 +1,535 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 26
+input_size = (288, 384)
+
+# runtime
+max_epochs = 700
+stage2_num_epochs = 20
+base_lr = 4e-3
+train_batch_size = 256
+val_batch_size = 64
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
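+# Unlike the smaller variants (weight decay 0), the -x model regularizes with
+# weight decay 0.05; norm and bias parameters are exempted via paramwise_cfg,
+# and gradients are clipped to an L2 norm of 35.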
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
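+# The -x recipe scales everything up: 288x384 input (vs 192x256 for -t/-s)
+# with a correspondingly larger label-smoothing sigma, and the pipelines
+# below use stronger geometric augmentation (scale 0.5-1.5, rotations up to
+# 90 degrees).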
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
+ (20, 21), (21, 23), (22, 25)]
+
+aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
+ (12, 17), (13, 18)]
+
+crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
+ (13, 18)]
+
+mpii_halpe26 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (8, 18),
+ (9, 17),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_halpe26 = [
+ (0, 18),
+ (2, 17),
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_halpe26 = [(i, i) for i in range(26)]
+
+ochuman_halpe26 = [(i, i) for i in range(17)]
+
+posetrack_halpe26 = [
+ (0, 0),
+ (2, 17),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ dataset_coco,
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_halpe,
+ dataset_posetrack,
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# val datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=coco_halpe26)
+ ],
+)
+
+val_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_val.json',
+ data_prefix=dict(
+ img='pose/ai_challenge/ai_challenger_keypoint'
+ '_validation_20170911/keypoint_validation_images_20170911/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_halpe26)
+ ],
+)
+
+val_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_halpe26)
+ ],
+)
+
+val_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_val.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_halpe26)
+ ],
+)
+
+val_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_test.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_halpe26)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_halpe26)
+ ],
+)
+
+val_ochuman = dict(
+ type='OCHumanDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='ochuman/annotations/'
+ 'ochuman_coco_format_val_range_0.00_1.00.json',
+ data_prefix=dict(img='pose/OCHuman/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=ochuman_halpe26)
+ ],
+)
+
+val_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_val.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_halpe26)
+ ],
+)
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
+ datasets=[
+ val_coco,
+ val_aic,
+ val_crowdpose,
+ val_mpii,
+ val_jhmdb,
+ val_halpe,
+ val_ochuman,
+ val_posetrack,
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
+val_evaluator = test_evaluator
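
The train/val configs above funnel every source dataset through a `KeypointConverter` so that heterogeneous annotations all land in the same 26-keypoint Halpe layout before `CombinedDataset` concatenates them. A minimal sketch of what such a `(src, dst)` index mapping does to one instance's keypoints — `remap_keypoints` is illustrative, not mmpose's implementation:

```python
import numpy as np

def remap_keypoints(src_kpts, src_vis, mapping, num_keypoints=26):
    """Re-index (K_src, 2) keypoints into a fixed-size target layout.

    mapping: list of (src_idx, dst_idx) pairs, as in the KeypointConverter
    configs above. Unmapped target slots stay at zero with visibility 0,
    so losses can mask them out via target weights.
    """
    dst_kpts = np.zeros((num_keypoints, 2), dtype=np.float32)
    dst_vis = np.zeros(num_keypoints, dtype=np.float32)
    for src_idx, dst_idx in mapping:
        dst_kpts[dst_idx] = src_kpts[src_idx]
        dst_vis[dst_idx] = src_vis[src_idx]
    return dst_kpts, dst_vis

# e.g. the 14-keypoint AIC -> COCO-17 style mapping used later in this diff:
aic_to_coco17 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9),
                 (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
kpts, vis = remap_keypoints(np.random.rand(14, 2), np.ones(14),
                            aic_to_coco17, num_keypoints=17)
```
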
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md
new file mode 100644
index 0000000..261949c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.md
@@ -0,0 +1,76 @@
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+- Results on COCO val2017, obtained with a person detector that scores 56.4 human AP on COCO val2017.
+- `*` denotes a model trained on 7 public datasets:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [MS COCO](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+- `Body8` denotes evaluation on the [OCHuman](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#ochuman) dataset in addition to the 7 datasets listed above.
+
+| Config | Input Size | AP<br>(COCO) | PCK@0.1<br>(Body8) | AUC<br>(Body8) | EPE<br>(Body8) | Params(M) | FLOPS(G) | Download |
+| :--------------------------------------------: | :--------: | :---------------: | :---------------------: | :-----------------: | :-----------------: | :-------: | :------: | :-----------------------------------------------: |
+| [RTMPose-t\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py) | 256x192 | 65.9 | 91.44 | 63.18 | 19.45 | 3.34 | 0.36 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7_420e-256x192-026a1439_20230504.pth) |
+| [RTMPose-s\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py) | 256x192 | 69.7 | 92.45 | 65.15 | 17.85 | 5.47 | 0.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth) |
+| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py) | 256x192 | 74.9 | 94.25 | 68.59 | 15.12 | 13.59 | 1.93 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth) |
+| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py) | 256x192 | 76.7 | 95.08 | 70.14 | 13.79 | 27.66 | 4.16 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth) |
+| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py) | 384x288 | 76.6 | 94.64 | 70.38 | 13.98 | 13.72 | 4.33 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth) |
+| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py) | 384x288 | 78.3 | 95.36 | 71.58 | 13.08 | 27.79 | 9.35 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth) |
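
For reference, a hedged sketch of how one config/checkpoint pair from the table above would be loaded with the `mmpose.apis` helpers this PR already uses elsewhere; the local config path is an assumption that depends on how this repository vendors the rtmpose configs:

```python
from mmpose.apis import inference_topdown, init_model

# Config and weights taken from the RTMPose-m row of the table above;
# the config path is illustrative and depends on the checkout layout.
cfg = ('modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/'
       'rtmpose-m_8xb256-420e_body8-256x192.py')
ckpt = ('https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
        'rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth')

model = init_model(cfg, ckpt, device='cpu')
# Without bboxes, inference_topdown treats the whole image as one instance.
results = inference_topdown(model, 'demo.jpg')
print(results[0].pred_instances.keypoints.shape)  # expected: (1, 17, 2)
```
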
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml
new file mode 100644
index 0000000..768ab0c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-coco.yml
@@ -0,0 +1,98 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: &id002
+ - AI Challenger
+ - COCO
+ - CrowdPose
+ - MPII
+ - sub-JHMDB
+ - Halpe
+ - PoseTrack18
+ Name: rtmpose-t_8xb256-420e_body8-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.659
+ Mean@0.1: 0.914
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7_420e-256x192-026a1439_20230504.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-s_8xb256-420e_body8-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.697
+ Mean@0.1: 0.925
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py
+ In Collection: RTMPose
+ Alias:
+ - human
+ - body
+ - body17
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-420e_body8-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.749
+ Mean@0.1: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb256-420e_body8-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.767
+ Mean@0.1: 0.951
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-420e_body8-384x288
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.766
+ Mean@0.1: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py
+ In Collection: RTMPose
+ Alias: rtmpose-l
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb256-420e_body8-384x288
+ Results:
+ - Dataset: Body8
+ Metrics:
+ AP: 0.783
+ Mean@0.1: 0.964
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth
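
The `&id001`/`*id001` markers in the model-zoo YAML above are plain YAML anchors and aliases: every model entry shares the single `Architecture` and `Training Data` lists defined on the first entry. A quick self-contained check with PyYAML:

```python
import yaml

doc = """
Models:
- Name: rtmpose-t
  Metadata:
    Training Data: &id002 [COCO, MPII]
- Name: rtmpose-s
  Metadata:
    Training Data: *id002
"""
models = yaml.safe_load(doc)['Models']
# The alias resolves to the very same list object defined by the anchor.
assert (models[0]['Metadata']['Training Data']
        is models[1]['Metadata']['Training Data'])
print(models[1]['Metadata']['Training Data'])  # ['COCO', 'MPII']
```
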
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md
new file mode 100644
index 0000000..c6ab08d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.md
@@ -0,0 +1,74 @@
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+AlphaPose (TPAMI'2022)
+
+```bibtex
+@article{alphapose,
+ author = {Fang, Hao-Shu and Li, Jiefeng and Tang, Hongyang and Xu, Chao and Zhu, Haoyi and Xiu, Yuliang and Li, Yong-Lu and Lu, Cewu},
+ journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+ title = {AlphaPose: Whole-Body Regional Multi-Person Pose Estimation and Tracking in Real-Time},
+ year = {2022}
+}
+```
+
+- `*` denotes a model trained on 7 public datasets:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [MS COCO](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+- `Body8` denotes evaluation on the [OCHuman](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#ochuman) dataset in addition to the 7 datasets listed above.
+
+| Config | Input Size | PCK@0.1<br>(Body8) | AUC<br>(Body8) | Params(M) | FLOPS(G) | Download |
+| :--------------------------------------------------------------: | :--------: | :---------------------: | :-----------------: | :-------: | :------: | :-----------------------------------------------------------------: |
+| [RTMPose-t\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py) | 256x192 | 91.89 | 66.35 | 3.51 | 0.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7-halpe26_700e-256x192-6020f8a6_20230605.pth) |
+| [RTMPose-s\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py) | 256x192 | 93.01 | 68.62 | 5.70 | 0.70 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7-halpe26_700e-256x192-7f134165_20230605.pth) |
+| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py) | 256x192 | 94.75 | 71.91 | 13.93 | 1.95 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-256x192-4d3e73dd_20230605.pth) |
+| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py) | 256x192 | 95.37 | 73.19 | 28.11 | 4.19 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-256x192-2abb7558_20230605.pth) |
+| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py) | 384x288 | 95.15 | 73.56 | 14.06 | 4.37 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-384x288-89e6428b_20230605.pth) |
+| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py) | 384x288 | 95.56 | 74.38 | 28.24 | 9.40 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-384x288-734182ce_20230605.pth) |
+| [RTMPose-x\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py) | 384x288 | 95.74 | 74.82 | 50.00 | 17.29 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-body7_pt-body7-halpe26_700e-384x288-7fb6e239_20230606.pth) |
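
These tables report PCK@0.1: the fraction of keypoints predicted within 0.1 of a normalization length (for mmpose's `PCKAccuracy`, derived from the bbox size) of the ground truth. A minimal numpy sketch of the metric, not mmpose's implementation:

```python
import numpy as np

def pck(pred, gt, visible, norm_size, thr=0.1):
    """Fraction of visible keypoints within thr * norm_size of ground truth.

    pred, gt: (K, 2) arrays; visible: (K,) boolean mask;
    norm_size: scalar normalization length (e.g. a bbox side length).
    """
    dist = np.linalg.norm(pred - gt, axis=-1) / norm_size
    return float((dist[visible] <= thr).mean())

gt = np.array([[50.0, 50.0], [100.0, 120.0], [80.0, 200.0]])
pred = gt + np.array([[5.0, 0.0], [0.0, 30.0], [1.0, 1.0]])
# Two of three keypoints fall within 0.1 * 256 = 25.6 px -> 0.667
print(pck(pred, gt, np.array([True, True, True]), norm_size=256.0))
```
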
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml
new file mode 100644
index 0000000..aca2527
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/body8/rtmpose_body8-halpe26.yml
@@ -0,0 +1,107 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb1024-700e_body8-halpe26-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: &id002
+ - AI Challenger
+ - COCO
+ - CrowdPose
+ - MPII
+ - sub-JHMDB
+ - Halpe
+ - PoseTrack18
+ Name: rtmpose-t_8xb1024-700e_body8-halpe26-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.919
+ AUC: 0.664
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7-halpe26_700e-256x192-6020f8a6_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb1024-700e_body8-halpe26-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-s_8xb1024-700e_body8-halpe26-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.930
+ AUC: 0.682
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7-halpe26_700e-256x192-7f134165_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-256x192.py
+ In Collection: RTMPose
+ Alias: body26
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb512-700e_body8-halpe26-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.947
+ AUC: 0.719
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-256x192-4d3e73dd_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb512-700e_body8-halpe26-256x192
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.954
+ AUC: 0.732
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-256x192-2abb7558_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb512-700e_body8-halpe26-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb512-700e_body8-halpe26-384x288
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.952
+ AUC: 0.736
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7-halpe26_700e-384x288-89e6428b_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb512-700e_body8-halpe26-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb512-700e_body8-halpe26-384x288
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.956
+ AUC: 0.744
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7-halpe26_700e-384x288-734182ce_20230605.pth
+- Config: configs/body_2d_keypoint/rtmpose/body8/rtmpose-x_8xb256-700e_body8-halpe26-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-x_8xb256-700e_body8-halpe26-384x288
+ Results:
+ - Dataset: Body8
+ Metrics:
+ Mean@0.1: 0.957
+ AUC: 0.748
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-x_simcc-body7_pt-body7-halpe26_700e-384x288-7fb6e239_20230606.pth
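
A downstream script could consume this model-zoo YAML directly, e.g. to pick the strongest Body8 checkpoint by AUC; a hedged sketch, with the file path illustrative:

```python
import yaml

with open('rtmpose_body8-halpe26.yml') as f:  # path is illustrative
    zoo = yaml.safe_load(f)

def auc(model):
    return model['Results'][0]['Metrics']['AUC']

best = max(zoo['Models'], key=auc)
print(best['Name'], auc(best), best['Weights'])
# -> rtmpose-x_8xb256-700e_body8-halpe26-384x288 0.748 https://...
```
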
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py
new file mode 100644
index 0000000..e1fda25
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py
@@ -0,0 +1,272 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
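
The `EMAHook` above maintains an exponential moving average of the model weights for evaluation; `ExpMomentumEMA` additionally decays the effective momentum from a large value early in training. A sketch of the steady-state update only, with that warmup omitted:

```python
import torch

@torch.no_grad()
def ema_update(ema_params, model_params, momentum=0.0002):
    # Steady-state EMA step: ema <- (1 - m) * ema + m * param.
    # ExpMomentumEMA also warms m up over early iterations, which this
    # sketch omits for brevity.
    for e, p in zip(ema_params, model_params):
        e.mul_(1.0 - momentum).add_(p, alpha=momentum)
```
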
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py
new file mode 100644
index 0000000..96be86d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py
@@ -0,0 +1,272 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
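
The `SimCCLabel` codec used by these configs casts keypoint localization as 1-D classification per axis: each coordinate axis is split into `input_size * simcc_split_ratio` bins, and a Gaussian centered on the true coordinate becomes the soft label for `KLDiscretLoss`. A minimal sketch of the x-axis encoding under the 384x288 settings above (details such as the `normalize` flag are simplified):

```python
import numpy as np

def simcc_label(x, input_size=288, split_ratio=2.0, sigma=6.0):
    """Soft 1-D classification target for one coordinate axis.

    x: ground-truth coordinate in input pixels. Returns a vector of
    length input_size * split_ratio with a Gaussian bump at x.
    """
    bins = np.arange(int(input_size * split_ratio))
    mu = x * split_ratio  # ground truth in bin coordinates
    label = np.exp(-((bins - mu) ** 2) / (2 * sigma ** 2))
    return label / label.sum()

label_x = simcc_label(150.0)
print(label_x.argmax() / 2.0)  # decoding the peak recovers ~150.0
```
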
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py
new file mode 100644
index 0000000..0d354d3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file=f'{data_root}person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
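
All of these configs share the same schedule: a 1000-iteration linear warmup, a flat `base_lr` until the halfway point, then cosine annealing down to `base_lr * 0.05`. A sketch of the epoch-level value once warmup has finished:

```python
import math

base_lr, max_epochs = 4e-3, 420
eta_min = base_lr * 0.05

def lr_at_epoch(epoch):
    # Flat at base_lr until max_epochs // 2, then cosine-annealed to
    # eta_min (the first-1000-iteration warmup is omitted here).
    half = max_epochs // 2
    if epoch < half:
        return base_lr
    t = (epoch - half) / half  # progress through the cosine phase
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

for e in (0, 210, 315, 420):
    print(e, f"{lr_at_epoch(e):.2e}")  # 4.00e-03, 4.00e-03, 2.10e-03, 2.00e-04
```
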
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py
new file mode 100644
index 0000000..24b70dd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py
@@ -0,0 +1,272 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128 * 2,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
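
`auto_scale_lr` encodes the linear scaling rule: when auto LR scaling is enabled at launch, the runner multiplies the optimizer LR by the ratio of the actual total batch size to `base_batch_size`. A one-line sketch:

```python
def scale_lr(base_lr, num_gpus, samples_per_gpu, base_batch_size=1024):
    # Linear scaling rule used by auto_scale_lr (only applied when the
    # runner is launched with auto LR scaling enabled).
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

# e.g. 4 GPUs x 256 samples matches the base batch, so the LR is unchanged:
print(scale_lr(4e-3, num_gpus=4, samples_per_gpu=256))  # 0.004
```
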
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py
new file mode 100644
index 0000000..7cb0e23
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py
@@ -0,0 +1,272 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128 * 2,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
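
`mmdet.PipelineSwitchHook` swaps the strong stage-1 augmentations for the milder `train_pipeline_stage2` over the last `stage2_num_epochs` epochs (here, the final 30 of 420). A toy sketch of the same switch:

```python
max_epochs, stage2_num_epochs = 420, 30
switch_epoch = max_epochs - stage2_num_epochs  # 390

def pick_pipeline(epoch, stage1, stage2):
    # Mirror of the hook's behavior: strong augmentation for most of
    # training, reduced augmentation for the final fine-tuning stage.
    return stage2 if epoch >= switch_epoch else stage1

print(pick_pipeline(0, 'stage1', 'stage2'))    # stage1
print(pick_pipeline(390, 'stage1', 'stage2'))  # stage2
```
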
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py
new file mode 100644
index 0000000..d0b2325
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine LR from epoch 210 to epoch 420
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
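+# Stage 2 softens augmentation relative to stage 1: no bbox shift, a narrower
+# scale range, smaller rotations, and CoarseDropout at p=0.5 instead of p=1.0.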
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file=f'{data_root}person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
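+# EMAHook evaluates with an exponential moving average of the model weights;
+# PipelineSwitchHook swaps in train_pipeline_stage2 for the final
+# stage2_num_epochs (the last 30 of 420 epochs here).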
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py
new file mode 100644
index 0000000..635f8c9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py
@@ -0,0 +1,272 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
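+# KeypointConverter remaps AIC's 14-keypoint annotations onto COCO's
+# 17-keypoint layout via (aic_index, coco_index) pairs; AIC's head-top and
+# neck keypoints have no COCO counterpart and are dropped.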
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128 * 2,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
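+# CombinedDataset serves both sources under a single COCO-format metainfo;
+# RepeatDataset(times=3) above makes each COCO sample appear three times per
+# epoch relative to AIC.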
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py
new file mode 100644
index 0000000..ee4f990
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file=f'{data_root}person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py
new file mode 100644
index 0000000..dde95b4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py
@@ -0,0 +1,273 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # Turn off EMA while training the tiny model
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py
new file mode 100644
index 0000000..d4d8180
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py
@@ -0,0 +1,233 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file=f'{data_root}person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # Turn off EMA while training the tiny model
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md
new file mode 100644
index 0000000..312a36b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md
@@ -0,0 +1,71 @@
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-t](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.682 | 0.883 | 0.759 | 0.736 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) |
+| [rtmpose-s](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.716 | 0.892 | 0.789 | 0.768 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) |
+| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.746 | 0.899 | 0.817 | 0.795 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) |
+| [rtmpose-l](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.758 | 0.906 | 0.826 | 0.806 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) |
+| [rtmpose-t-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.685 | 0.880 | 0.761 | 0.738 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.json) |
+| [rtmpose-s-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.722 | 0.892 | 0.794 | 0.772 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.json) |
+| [rtmpose-m-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.758 | 0.903 | 0.826 | 0.806 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.json) |
+| [rtmpose-l-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.765 | 0.906 | 0.835 | 0.813 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.json) |
+| [rtmpose-m-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py) | 384x288 | 0.770 | 0.908 | 0.833 | 0.816 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.json) |
+| [rtmpose-l-aic-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py) | 384x288 | 0.773 | 0.907 | 0.835 | 0.819 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.json) |
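+
+The COCO-trained checkpoints above can be loaded directly with the MMPose
+Python API. A minimal inference sketch (the image path `demo.jpg` and the
+local config path are placeholders; adjust both to your layout):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# Build rtmpose-m from its config and the released checkpoint.
+config = 'configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py'
+checkpoint = ('https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
+              'rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth')
+model = init_model(config, checkpoint, device='cpu')
+
+# Top-down inference; with no bboxes given, the whole image is used as one box.
+results = inference_topdown(model, 'demo.jpg')
+keypoints = results[0].pred_instances.keypoints  # array of shape (N, 17, 2)
+```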
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml
new file mode 100644
index 0000000..caea707
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.yml
@@ -0,0 +1,170 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: COCO
+ Name: rtmpose-t_8xb256-420e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.682
+ AP@0.5: 0.883
+ AP@0.75: 0.759
+ AR: 0.736
+ AR@0.5: 0.92
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: rtmpose-s_8xb256-420e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.716
+ AP@0.5: 0.892
+ AP@0.75: 0.789
+ AR: 0.768
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: rtmpose-m_8xb256-420e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.746
+ AP@0.5: 0.899
+ AP@0.75: 0.817
+ AR: 0.795
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: rtmpose-l_8xb256-420e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.758
+ AP@0.5: 0.906
+ AP@0.75: 0.826
+ AR: 0.806
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: &id002
+ - COCO
+ - AI Challenger
+ Name: rtmpose-t_8xb256-420e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.685
+ AP@0.5: 0.88
+ AP@0.75: 0.761
+ AR: 0.738
+ AR@0.5: 0.918
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-s_8xb256-420e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.722
+ AP@0.5: 0.892
+ AP@0.75: 0.794
+ AR: 0.772
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-420e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.758
+ AP@0.5: 0.903
+ AP@0.75: 0.826
+ AR: 0.806
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb256-420e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.765
+ AP@0.5: 0.906
+ AP@0.75: 0.835
+ AR: 0.813
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-420e_aic-coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.770
+ AP@0.5: 0.908
+ AP@0.75: 0.833
+ AR: 0.816
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth
+- Config: configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-l_8xb256-420e_aic-coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.773
+ AP@0.5: 0.907
+ AP@0.75: 0.835
+ AR: 0.819
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py
new file mode 100644
index 0000000..9ff68f5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py
@@ -0,0 +1,234 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 5e-4
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=14,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
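+# out_channels=14 matches CrowdPose's 14-keypoint skeleton (COCO uses 17).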
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
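+# iou_type='keypoints_crowd' reports AP split by crowd index into easy/medium/
+# hard (the AP (E)/(M)/(H) columns), and use_area=False because CrowdPose
+# annotations provide no instance area.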
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md
new file mode 100644
index 0000000..9314142
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md
@@ -0,0 +1,60 @@
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+Results on CrowdPose test with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.706 | 0.841 | 0.765 | 0.799 | 0.719 | 0.582 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml
new file mode 100644
index 0000000..ddfe25f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: CrowdPose
+  Name: rtmpose-m_8xb64-210e_crowdpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.706
+ AP@0.5: 0.841
+ AP@0.75: 0.765
+ AP (E): 0.799
+ AP (M): 0.719
+      AP (H): 0.582
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py
new file mode 100644
index 0000000..8ac425a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
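+# HumanArt annotation files store image paths relative to data_root, so
+# data_prefix uses an empty img prefix.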
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ # bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ # 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py
new file mode 100644
index 0000000..83a2c44
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
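+# SimCC represents each keypoint coordinate as a 1-D classification over
+# input_size * simcc_split_ratio bins per axis (0.5 px per bin here).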
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ # bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ # 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py
new file mode 100644
index 0000000..87bd833
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ # bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ # 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py
new file mode 100644
index 0000000..e5a8092
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py
@@ -0,0 +1,233 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 420
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 210 to 420 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ # bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ # 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # Turn off EMA while training the tiny model
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md
new file mode 100644
index 0000000..adc2bbd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.md
@@ -0,0 +1,117 @@
+
+
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+
+
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+
+Human-Art (CVPR'2023)
+
+```bibtex
+@inproceedings{ju2023humanart,
+    title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
+    author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
+    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    year={2023}
+}
+```
+
+
+
+Results on the Human-Art validation set, obtained with a person detector scoring 56.2 human AP on the Human-Art validation set
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.161 | 0.283 | 0.154 | 0.221 | 0.373 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) |
+| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.249 | 0.395 | 0.256 | 0.323 | 0.485 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) |
+| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.199 | 0.328 | 0.198 | 0.261 | 0.418 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) |
+| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.311 | 0.462 | 0.323 | 0.381 | 0.540 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) |
+| [rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.239 | 0.372 | 0.243 | 0.302 | 0.455 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) |
+| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.355 | 0.503 | 0.377 | 0.417 | 0.568 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) |
+| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.260 | 0.393 | 0.267 | 0.323 | 0.472 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) |
+| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.378 | 0.521 | 0.399 | 0.442 | 0.584 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) |
+
+Results on the Human-Art validation set with ground-truth bounding boxes
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.444 | 0.725 | 0.453 | 0.488 | 0.750 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) |
+| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.655 | 0.872 | 0.720 | 0.693 | 0.890 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) |
+| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.480 | 0.739 | 0.498 | 0.521 | 0.763 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) |
+| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.698 | 0.893 | 0.768 | 0.732 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) |
+| [rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.532 | 0.765 | 0.563 | 0.571 | 0.789 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) |
+| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.728 | 0.895 | 0.791 | 0.759 | 0.906 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) |
+| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.564 | 0.789 | 0.602 | 0.599 | 0.808 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) |
+| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.753 | 0.905 | 0.812 | 0.783 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) |
+
+Results on COCO val2017, obtained with a person detector scoring 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-t-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.682 | 0.883 | 0.759 | 0.736 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) |
+| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.665 | 0.875 | 0.739 | 0.721 | 0.916 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) |
+| [rtmpose-s-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.716 | 0.892 | 0.789 | 0.768 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) |
+| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.706 | 0.888 | 0.780 | 0.759 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) |
+| [rtmpose-m-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.746 | 0.899 | 0.817 | 0.795 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) |
+| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.725 | 0.892 | 0.795 | 0.775 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) |
+| [rtmpose-l-coco](/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.758 | 0.906 | 0.826 | 0.806 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) |
+| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.748 | 0.901 | 0.816 | 0.796 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) |
+
+Results on COCO val2017 with ground-truth bounding boxes
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rtmpose-t-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py) | 256x192 | 0.679 | 0.895 | 0.755 | 0.710 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.json) |
+| [rtmpose-s-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py) | 256x192 | 0.725 | 0.916 | 0.798 | 0.753 | 0.925 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.json) |
+| [rtmpose-m-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py) | 256x192 | 0.744 | 0.916 | 0.818 | 0.770 | 0.930 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.json) |
+| [rtmpose-l-humanart-coco](/configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py) | 256x192 | 0.770 | 0.927 | 0.840 | 0.794 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.json) |
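+
+As a rough usage sketch (assuming `mmpose` is installed; `demo.jpg` is a placeholder path, and any config/checkpoint pair from the tables above can be substituted):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+config = 'configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py'
+checkpoint = 'https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+# Without explicit bounding boxes, the whole image is treated as one instance.
+results = inference_topdown(model, 'demo.jpg')
+keypoints = results[0].pred_instances.keypoints  # (num_instances, 17, 2)
+```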
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml
new file mode 100644
index 0000000..aaabbcd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/humanart/rtmpose_humanart.yml
@@ -0,0 +1,138 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-l_8xb256-420e_humanart-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: &id002
+ - COCO
+ - Human-Art
+ Name: rtmpose-l_8xb256-420e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.901
+ AP@0.75: 0.816
+ AR: 0.796
+ AR@0.5: 0.938
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.378
+ AP@0.5: 0.521
+ AP@0.75: 0.399
+ AR: 0.442
+ AR@0.5: 0.584
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.753
+ AP@0.5: 0.905
+ AP@0.75: 0.812
+ AR: 0.783
+ AR@0.5: 0.915
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_8xb256-420e_humanart-256x192-389f2cb0_20230611.pth
+- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-m_8xb256-420e_humanart-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-420e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.725
+ AP@0.5: 0.892
+ AP@0.75: 0.795
+ AR: 0.775
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.355
+ AP@0.5: 0.503
+ AP@0.75: 0.377
+ AR: 0.417
+ AR@0.5: 0.568
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.728
+ AP@0.5: 0.895
+ AP@0.75: 0.791
+ AR: 0.759
+ AR@0.5: 0.906
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_8xb256-420e_humanart-256x192-8430627b_20230611.pth
+- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-s_8xb256-420e_humanart-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-s_8xb256-420e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.706
+ AP@0.5: 0.888
+ AP@0.75: 0.780
+ AR: 0.759
+ AR@0.5: 0.928
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.311
+ AP@0.5: 0.462
+ AP@0.75: 0.323
+ AR: 0.381
+ AR@0.5: 0.540
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.698
+ AP@0.5: 0.893
+ AP@0.75: 0.768
+ AR: 0.732
+ AR@0.5: 0.903
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_8xb256-420e_humanart-256x192-5a3ac943_20230611.pth
+- Config: configs/body_2d_keypoint/rtmpose/humanart/rtmpose-t_8xb256-420e_humanart-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-t_8xb256-420e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.665
+ AP@0.5: 0.875
+ AP@0.75: 0.739
+ AR: 0.721
+ AR@0.5: 0.916
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.249
+ AP@0.5: 0.395
+ AP@0.75: 0.256
+ AR: 0.323
+ AR@0.5: 0.485
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.655
+ AP@0.5: 0.872
+ AP@0.75: 0.720
+ AR: 0.693
+ AR@0.5: 0.890
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_8xb256-420e_humanart-256x192-60b68c98_20230612.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..3f5c6af
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,228 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from 105 to 210 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=16,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/pose/MPI/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md
new file mode 100644
index 0000000..b8ffd12
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md
@@ -0,0 +1,43 @@
+
+
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean (w/ flip) | Mean@0.1 | ckpt | log |
+| :------------------------------------------------------- | :--------: | :------------: | :------: | :------------------------------------------------------: | :------------------------------------------------------: |
+| [rtmpose-m](/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py) | 256x256 | 0.907 | 0.348 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml
new file mode 100644
index 0000000..7ff95d0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: MPII
+ Name: rtmpose-m_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.907
+ Mean@0.1: 0.348
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md
new file mode 100644
index 0000000..6d377d0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/README.md
@@ -0,0 +1,20 @@
+# Top-down SimCC-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation within each detected bounding box. In the second stage, SimCC-based methods reformulate human pose estimation as two classification tasks, one for the horizontal and one for the vertical coordinate: each pixel is uniformly split into several bins, so keypoint coordinates are predicted at sub-pixel resolution from the features extracted from the bounding-box area, following the paradigm introduced in [SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation](https://arxiv.org/abs/2107.03332).
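+
+The decoding step is simple enough to sketch directly. The snippet below is a minimal NumPy illustration of the idea, not the MMPose implementation; in particular, the per-keypoint confidence heuristic is an assumption:
+
+```python
+import numpy as np
+
+
+def simcc_decode(simcc_x, simcc_y, simcc_split_ratio=2.0):
+    """simcc_x: (K, W * split) logits; simcc_y: (K, H * split) logits."""
+    # The predicted coordinate is the argmax bin index, mapped back to
+    # pixel units by the split ratio (bins are finer than one pixel).
+    x = np.argmax(simcc_x, axis=1) / simcc_split_ratio
+    y = np.argmax(simcc_y, axis=1) / simcc_split_ratio
+    # A simple confidence score: mean of the two peak activations.
+    scores = (simcc_x.max(axis=1) + simcc_y.max(axis=1)) / 2
+    return np.stack([x, y], axis=-1), scores
+```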
+
+
+

+
+
+## Results and Models
+
+### COCO Dataset
+
+Results on COCO val2017, obtained with a person detector scoring 56.4 human AP on COCO val2017
+
+| Model | Input Size | AP | AR | Details and Download |
+| :---------------------------: | :--------: | :---: | :---: | :-----------------------------------------------: |
+| ResNet-50+SimCC | 384x288 | 0.735 | 0.790 | [resnet_coco.md](./coco/resnet_coco.md) |
+| ResNet-50+SimCC | 256x192 | 0.721 | 0.781 | [resnet_coco.md](./coco/resnet_coco.md) |
+| S-ViPNAS-MobileNet-V3+SimCC | 256x192 | 0.695 | 0.755 | [vipnas_coco.md](./coco/vipnas_coco.md) |
+| MobileNet-V2+SimCC (w/o deconv) | 256x192 | 0.620 | 0.678 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md
new file mode 100644
index 0000000..e6d5b72
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.md
@@ -0,0 +1,55 @@
+
+
+
+SimCC (ECCV'2022)
+
+```bibtex
+@misc{li2021simcc,
+  title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation},
+  author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
+  year={2021},
+  eprint={2107.03332},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017, obtained with a person detector scoring 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [simcc_mobilenetv2_wo_deconv](/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py) | 256x192 | 0.620 | 0.855 | 0.697 | 0.678 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml
new file mode 100644
index 0000000..a72c85f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py
+ In Collection: SimCC
+ Metadata:
+ Architecture: &id001
+ - SimCC
+ - MobilenetV2
+ Training Data: COCO
+ Name: simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.62
+ AP@0.5: 0.855
+ AP@0.75: 0.697
+ AR: 0.678
+ AR@0.5: 0.902
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md
new file mode 100644
index 0000000..16b5eb0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.md
@@ -0,0 +1,56 @@
+
+
+
+SimCC (ECCV'2022)
+
+```bibtex
+@misc{li2021simcc,
+  title={SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation},
+  author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
+  year={2021},
+  eprint={2107.03332},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017, obtained with a person detector scoring 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [simcc_resnet_50](/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.721 | 0.897 | 0.798 | 0.781 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.log.json) |
+| [simcc_resnet_50](/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py) | 384x288 | 0.735 | 0.899 | 0.800 | 0.790 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml
new file mode 100644
index 0000000..04c0aca
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml
@@ -0,0 +1,41 @@
+Collections:
+- Name: SimCC
+ Paper:
+    Title: "SimCC: A Simple Coordinate Classification Perspective for Human Pose Estimation"
+ URL: https://arxiv.org/abs/2107.03332
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/simcc.md
+Models:
+- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py
+ In Collection: SimCC
+ Metadata:
+ Architecture: &id001
+ - SimCC
+ - ResNet
+ Training Data: COCO
+ Name: simcc_res50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.721
+ AP@0.5: 0.900
+ AP@0.75: 0.798
+ AR: 0.781
+ AR@0.5: 0.937
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth
+- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py
+ In Collection: SimCC
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: simcc_res50_8xb32-140e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.735
+ AP@0.5: 0.899
+ AP@0.75: 0.800
+ AR: 0.790
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..1e5f62a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='mmcls://mobilenet_v2',
+ )),
+ head=dict(
+ type='SimCCHead',
+ in_channels=1280,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
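+        # 'wo-deconv' in the config name: no deconv upsampling, so the SimCC
+        # head classifies directly on the 1/32-resolution backbone feature map.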
+ deconv_out_channels=None,
+ loss=dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
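+# deconv_out_channels=None removes the upsampling deconv layers (the
+# "wo-deconv" tag in this config's file name), so the SimCC classifiers
+# operate directly on the 1/32-resolution MobileNetV2 feature map.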
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py
new file mode 100644
index 0000000..c724f83
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=140, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[90, 120],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel', input_size=(288, 384), sigma=6.0, simcc_split_ratio=2.0)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='SimCCHead',
+ in_channels=2048,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ loss=dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
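+# in_featuremap_size is the input size divided by the backbone's overall
+# stride (32 for ResNet-50): a 288x384 input yields a 9x12 feature map.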
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
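+# Note: validation/testing uses detected person boxes from bbox_file (a
+# detector with 56.4 human AP on COCO val2017) rather than ground-truth
+# boxes, matching the protocol quoted in the model zoo tables.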
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..2a08ff9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,114 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=train_cfg['max_epochs'],
+        milestones=[170, 200],
+        gamma=0.1,
+        by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='SimCCHead',
+ in_channels=2048,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ loss=dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..cd2fe16
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py
@@ -0,0 +1,119 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel', input_size=(192, 256), sigma=6.0, simcc_split_ratio=2.0)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ViPNAS_MobileNetV3'),
+ head=dict(
+ type='SimCCHead',
+ in_channels=160,
+ out_channels=17,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ deconv_type='vipnas',
+ deconv_out_channels=(160, 160, 160),
+ deconv_num_groups=(160, 160, 160),
+ loss=dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
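+# Unlike the "wo-deconv" SimCC variants, this head keeps three ViPNAS-style
+# deconv layers; deconv_num_groups equal to the channel counts makes them
+# depthwise, upsampling the 160-channel MobileNetV3 features before the
+# coordinate classifiers.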
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=data_root + 'person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md
new file mode 100644
index 0000000..4d36b73
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.md
@@ -0,0 +1,54 @@
+
+
+
+SimCC (ECCV'2022)
+
+```bibtex
+@misc{li2021simcc,
+  title={SimCC: A Simple Coordinate Classification Perspective for Human Pose Estimation},
+ author={Li, Yanjie and Yang, Sen and Liu, Peidong and Zhang, Shoukui and Wang, Yunxiao and Wang, Zhicheng and Yang, Wankou and Xia, Shu-Tao},
+ year={2021}
+}
+```
+
+
+
+
+
+
+ViPNAS (CVPR'2021)
+
+```bibtex
+@inproceedings{xu2021vipnas,
+ title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
+ author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year={2021}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [simcc_S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.695 | 0.883 | 0.772 | 0.755 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml
new file mode 100644
index 0000000..3790a10
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py
+ In Collection: SimCC
+ Metadata:
+ Architecture: &id001
+ - SimCC
+ - ViPNAS
+ Training Data: COCO
+ Name: simcc_vipnas-mbv3_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.695
+ AP@0.5: 0.883
+ AP@0.75: 0.772
+ AR: 0.755
+ AR@0.5: 0.927
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..74a43d5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel', input_size=(256, 256), sigma=6.0, simcc_split_ratio=2.0)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='SimCCHead',
+ in_channels=2048,
+ out_channels=16,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ deconv_out_channels=None,
+ loss=dict(type='KLDiscretLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
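+# MpiiPCKAccuracy reports PCKh: a prediction counts as correct when it lies
+# within a fraction of the head segment length, which is why the val split
+# loads headbox_file (mpii_gt_val.mat) above.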
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..f0b54aa
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,133 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the detected bounding boxes. Instead of regressing keypoint coordinates directly, the pose estimator produces heatmaps that represent the likelihood of each location being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
+
+
+
+
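+To make the decoding step concrete: recovering keypoint coordinates from a
+predicted heatmap is essentially a per-channel argmax followed by rescaling
+from heatmap resolution back to the detected person box. The sketch below is
+a minimal NumPy illustration of that idea, not mmpose's actual decoder API
+(the function name and array shapes are assumptions):
+
+```python
+import numpy as np
+
+def decode_heatmaps(heatmaps, bbox_xywh):
+    """heatmaps: (K, H, W) per-keypoint likelihood maps."""
+    K, H, W = heatmaps.shape
+    x0, y0, w, h = bbox_xywh
+    flat = heatmaps.reshape(K, -1)
+    idx = flat.argmax(axis=1)            # best location per keypoint
+    xs, ys = idx % W, idx // W           # heatmap-grid coordinates
+    # map from the heatmap grid back into the person bounding box
+    keypoints = np.stack([x0 + xs * (w / W), y0 + ys * (h / H)], axis=1)
+    scores = flat.max(axis=1)            # peak value as confidence
+    return keypoints, scores
+```
+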
+## Results and Models
+
+### COCO Dataset
+
+Results on COCO val2017 with a detector having human AP of 56.4 on COCO val2017
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------------: | :--------: | :---: | :---: | :-------------------------------------------------: |
+| ViTPose-h | 256x192 | 0.790 | 0.840 | [vitpose_coco.md](./coco/vitpose_coco.md) |
+| HRNet-w48+UDP | 256x192 | 0.768 | 0.817 | [hrnet_udp_coco.md](./coco/hrnet_udp_coco.md) |
+| MSPN 4-stg | 256x192 | 0.765 | 0.826 | [mspn_coco.md](./coco/mspn_coco.md) |
+| HRNet-w48+Dark | 256x192 | 0.764 | 0.814 | [hrnet_dark_coco.md](./coco/hrnet_dark_coco.md) |
+| HRNet-w48 | 256x192 | 0.756 | 0.809 | [hrnet_coco.md](./coco/hrnet_coco.md) |
+| HRFormer-B | 256x192 | 0.754 | 0.807 | [hrformer_coco.md](./coco/hrformer_coco.md) |
+| RSN-50-3x | 256x192 | 0.750 | 0.814 | [rsn_coco.md](./coco/rsn_coco.md) |
+| CSPNeXt-l | 256x192 | 0.750 | 0.800 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) |
+| HRNet-w32 | 256x192 | 0.749 | 0.804 | [hrnet_coco.md](./coco/hrnet_coco.md) |
+| Swin-L | 256x192 | 0.743 | 0.798 | [swin_coco.md](./coco/swin_coco.md) |
+| ViTPose-s | 256x192 | 0.739 | 0.792 | [vitpose_coco.md](./coco/vitpose_coco.md) |
+| HRFormer-S | 256x192 | 0.738 | 0.793 | [hrformer_coco.md](./coco/hrformer_coco.md) |
+| Swin-B | 256x192 | 0.737 | 0.794 | [swin_coco.md](./coco/swin_coco.md) |
+| SEResNet-101 | 256x192 | 0.734 | 0.790 | [seresnet_coco.md](./coco/seresnet_coco.md) |
+| SCNet-101 | 256x192 | 0.733 | 0.789 | [scnet_coco.md](./coco/scnet_coco.md) |
+| ResNet-101+Dark | 256x192 | 0.733 | 0.786 | [resnet_dark_coco.md](./coco/resnet_dark_coco.md) |
+| CSPNeXt-m | 256x192 | 0.732 | 0.785 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) |
+| ResNetV1d-101 | 256x192 | 0.732 | 0.785 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) |
+| SEResNet-50 | 256x192 | 0.729 | 0.784 | [seresnet_coco.md](./coco/seresnet_coco.md) |
+| SCNet-50 | 256x192 | 0.728 | 0.784 | [scnet_coco.md](./coco/scnet_coco.md) |
+| ResNet-101 | 256x192 | 0.726 | 0.783 | [resnet_coco.md](./coco/resnet_coco.md) |
+| ResNeXt-101 | 256x192 | 0.726 | 0.781 | [resnext_coco.md](./coco/resnext_coco.md) |
+| HourglassNet | 256x256 | 0.726 | 0.780 | [hourglass_coco.md](./coco/hourglass_coco.md) |
+| ResNeSt-101 | 256x192 | 0.725 | 0.781 | [resnest_coco.md](./coco/resnest_coco.md) |
+| RSN-50 | 256x192 | 0.724 | 0.790 | [rsn_coco.md](./coco/rsn_coco.md) |
+| Swin-T | 256x192 | 0.724 | 0.782 | [swin_coco.md](./coco/swin_coco.md) |
+| MSPN 1-stg | 256x192 | 0.723 | 0.788 | [mspn_coco.md](./coco/mspn_coco.md) |
+| ResNetV1d-50 | 256x192 | 0.722 | 0.777 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) |
+| ResNeSt-50 | 256x192 | 0.720 | 0.775 | [resnest_coco.md](./coco/resnest_coco.md) |
+| ResNet-50 | 256x192 | 0.718 | 0.774 | [resnet_coco.md](./coco/resnet_coco.md) |
+| ResNeXt-50 | 256x192 | 0.715 | 0.771 | [resnext_coco.md](./coco/resnext_coco.md) |
+| PVT-S | 256x192 | 0.714 | 0.773 | [pvt_coco.md](./coco/pvt_coco.md) |
+| CSPNeXt-s | 256x192 | 0.697 | 0.753 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) |
+| LiteHRNet-30 | 256x192 | 0.676 | 0.736 | [litehrnet_coco.md](./coco/litehrnet_coco.md) |
+| CSPNeXt-tiny | 256x192 | 0.665 | 0.723 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) |
+| MobileNet-v2 | 256x192 | 0.648 | 0.709 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) |
+| LiteHRNet-18 | 256x192 | 0.642 | 0.705 | [litehrnet_coco.md](./coco/litehrnet_coco.md) |
+| CPM | 256x192 | 0.627 | 0.689 | [cpm_coco.md](./coco/cpm_coco.md) |
+| ShuffleNet-v2 | 256x192 | 0.602 | 0.668 | [shufflenetv2_coco.md](./coco/shufflenetv2_coco.md) |
+| ShuffleNet-v1 | 256x192 | 0.587 | 0.654 | [shufflenetv1_coco.md](./coco/shufflenetv1_coco.md) |
+| AlexNet | 256x192 | 0.448 | 0.521 | [alexnet_coco.md](./coco/alexnet_coco.md) |
+
+### MPII Dataset
+
+| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download |
+| :------------: | :--------: | :------: | :------: | :-------------------------------------------------: |
+| HRNet-w48+Dark | 256x256 | 0.905 | 0.360 | [hrnet_dark_mpii.md](./mpii/hrnet_dark_mpii.md) |
+| CSPNeXt-m      | 256x256    | 0.902    | 0.303    | [cspnext_udp_mpii.md](./mpii/cspnext_udp_mpii.md)   |
+| HRNet-w48 | 256x256 | 0.901 | 0.337 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) |
+| HRNet-w32 | 256x256 | 0.900 | 0.334 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) |
+| HourglassNet | 256x256 | 0.889 | 0.317 | [hourglass_mpii.md](./mpii/hourglass_mpii.md) |
+| ResNet-152 | 256x256 | 0.889 | 0.303 | [resnet_mpii.md](./mpii/resnet_mpii.md) |
+| ResNetV1d-152 | 256x256 | 0.888 | 0.300 | [resnetv1d_mpii.md](./mpii/resnetv1d_mpii.md) |
+| SCNet-50 | 256x256 | 0.888 | 0.290 | [scnet_mpii.md](./mpii/scnet_mpii.md) |
+| ResNeXt-152 | 256x256 | 0.887 | 0.294 | [resnext_mpii.md](./mpii/resnext_mpii.md) |
+| SEResNet-50 | 256x256 | 0.884 | 0.292 | [seresnet_mpii.md](./mpii/seresnet_mpii.md) |
+| ResNet-50 | 256x256 | 0.882 | 0.286 | [resnet_mpii.md](./mpii/resnet_mpii.md) |
+| ResNetV1d-50 | 256x256 | 0.881 | 0.290 | [resnetv1d_mpii.md](./mpii/resnetv1d_mpii.md) |
+| CPM | 368x368\* | 0.876 | 0.285 | [cpm_mpii.md](./mpii/cpm_mpii.md) |
+| LiteHRNet-30 | 256x256 | 0.869 | 0.271 | [litehrnet_mpii.md](./mpii/litehrnet_mpii.md) |
+| LiteHRNet-18 | 256x256 | 0.859 | 0.260 | [litehrnet_mpii.md](./mpii/litehrnet_mpii.md) |
+| MobileNet-v2 | 256x256 | 0.854 | 0.234 | [mobilenetv2_mpii.md](./mpii/mobilenetv2_mpii.md) |
+| ShuffleNet-v2 | 256x256 | 0.828 | 0.205 | [shufflenetv2_mpii.md](./mpii/shufflenetv2_mpii.md) |
+| ShuffleNet-v1 | 256x256 | 0.824 | 0.195 | [shufflenetv1_mpii.md](./mpii/shufflenetv1_mpii.md) |
+
+### CrowdPose Dataset
+
+Results on CrowdPose test with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Model | Input Size | AP | AR | Details and Download |
+| :--------: | :--------: | :---: | :---: | :--------------------------------------------------------: |
+| HRNet-w32 | 256x192 | 0.675 | 0.816 | [hrnet_crowdpose.md](./crowdpose/hrnet_crowdpose.md) |
+| CSPNeXt-m  | 256x192    | 0.662 | 0.755 | [cspnext_udp_crowdpose.md](./crowdpose/cspnext_udp_crowdpose.md) |
+| ResNet-101 | 256x192 | 0.647 | 0.800 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) |
+| ResNet-50  | 256x192    | 0.637 | 0.785 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md)     |
+
+### AIC Dataset
+
+Results on AIC val set with ground-truth bounding boxes.
+
+| Model | Input Size | AP | AR | Details and Download |
+| :--------: | :--------: | :---: | :---: | :----------------------------------: |
+| HRNet-w32 | 256x192 | 0.323 | 0.366 | [hrnet_aic.md](./aic/hrnet_aic.md) |
+| ResNet-101 | 256x192 | 0.294 | 0.337 | [resnet_aic.md](./aic/resnet_aic.md) |
+
+### JHMDB Dataset
+
+| Model | Input Size | PCK(norm. by person size) | PCK (norm. by torso size) | Details and Download |
+| :-------: | :--------: | :-----------------------: | :-----------------------: | :----------------------------------------: |
+| ResNet-50 | 256x256 | 96.0 | 80.1 | [resnet_jhmdb.md](./jhmdb/resnet_jhmdb.md) |
+| CPM | 368x368 | 89.8 | 65.7 | [cpm_jhmdb.md](./jhmdb/cpm_jhmdb.md) |
+
+### PoseTrack2018 Dataset
+
+Results on PoseTrack2018 val with ground-truth bounding boxes.
+
+| Model | Input Size | AP | Details and Download |
+| :-------: | :--------: | :--: | :----------------------------------------------------------: |
+| HRNet-w48 | 256x192 | 84.6 | [hrnet_posetrack18.md](./posetrack18/hrnet_posetrack18.md) |
+| HRNet-w32 | 256x192 | 83.4 | [hrnet_posetrack18.md](./posetrack18/hrnet_posetrack18.md) |
+| ResNet-50 | 256x192 | 81.2 | [resnet_posetrack18.md](./posetrack18/resnet_posetrack18.md) |
+
+### Human-Art Dataset
+
+Results on the Human-Art validation set with a detector having human AP of 56.2 on the same set
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
+| ViTPose-s | 256x192 | 0.381 | 0.448 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) |
+| ViTPose-b | 256x192 | 0.410 | 0.475 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) |
+
+Results on the Human-Art validation set with ground-truth bounding boxes
+
+| Model | Input Size | AP | AR | Details and Download |
+| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
+| ViTPose-s | 256x192 | 0.738 | 0.768 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) |
+| ViTPose-b | 256x192 | 0.759 | 0.790 | [vitpose_humanart.md](./humanart/vitpose_humanart.md) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md
new file mode 100644
index 0000000..282bac8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.md
@@ -0,0 +1,38 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+AI Challenger (ArXiv'2017)
+
+```bibtex
+@article{wu2017ai,
+ title={Ai challenger: A large-scale dataset for going deeper in image understanding},
+ author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
+ journal={arXiv preprint arXiv:1711.06475},
+ year={2017}
+}
+```
+
+
+
+Results on AIC val set with ground-truth bounding boxes
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py) | 256x192 | 0.323 | 0.761 | 0.218 | 0.366 | 0.789 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192-30a4e465_20200826.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192_20200826.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml
new file mode 100644
index 0000000..9550a4f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/hrnet_aic.yml
@@ -0,0 +1,18 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture:
+ - HRNet
+ Training Data: AI Challenger
+ Name: td-hm_hrnet-w32_8xb64-210e_aic-256x192
+ Results:
+ - Dataset: AI Challenger
+ Metrics:
+ AP: 0.323
+ AP@0.5: 0.761
+ AP@0.75: 0.218
+ AR: 0.366
+ AR@0.5: 0.789
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_aic_256x192-30a4e465_20200826.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md
new file mode 100644
index 0000000..f4a457d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.md
@@ -0,0 +1,55 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+AI Challenger (ArXiv'2017)
+
+```bibtex
+@article{wu2017ai,
+ title={Ai challenger: A large-scale dataset for going deeper in image understanding},
+ author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
+ journal={arXiv preprint arXiv:1711.06475},
+ year={2017}
+}
+```
+
+
+
+Results on AIC val set with ground-truth bounding boxes
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py) | 256x192 | 0.294 | 0.736 | 0.172 | 0.337 | 0.762 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192-79b35445_20200826.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192_20200826.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml
new file mode 100644
index 0000000..9c89bc3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/resnet_aic.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: AI Challenger
+ Name: td-hm_res101_8xb64-210e_aic-256x192
+ Results:
+ - Dataset: AI Challenger
+ Metrics:
+ AP: 0.294
+ AP@0.5: 0.736
+ AP@0.75: 0.172
+ AR: 0.337
+ AR@0.5: 0.762
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_aic_256x192-79b35445_20200826.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py
new file mode 100644
index 0000000..1c6d8c5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py
@@ -0,0 +1,151 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=14,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
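+# HRNet maintains four parallel branches at strides 4/8/16/32; the head
+# reads the highest-resolution branch (32 channels at 1/4 scale), so
+# deconv_out_channels=None suffices for the 48x64 heatmaps.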
+
+# base dataset settings
+dataset_type = 'AicDataset'
+data_mode = 'topdown'
+data_root = 'data/aic/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/aic_train.json',
+ data_prefix=dict(img='ai_challenger_keypoint_train_20170902/'
+ 'keypoint_train_images_20170902/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/aic_val.json',
+ data_prefix=dict(img='ai_challenger_keypoint_validation_20170911/'
+ 'keypoint_validation_images_20170911/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/aic_val.json',
+ use_area=False)
+test_evaluator = val_evaluator
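+# use_area=False because AI Challenger annotations carry no segmentation
+# 'area' field; CocoMetric then normalizes OKS by the bounding-box area
+# instead.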
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py
new file mode 100644
index 0000000..d368730
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AicDataset'
+data_mode = 'topdown'
+data_root = 'data/aic/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/aic_train.json',
+ data_prefix=dict(img='ai_challenger_keypoint_train_20170902/'
+ 'keypoint_train_images_20170902/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/aic_val.json',
+ data_prefix=dict(img='ai_challenger_keypoint_validation_20170911/'
+ 'keypoint_validation_images_20170911/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/aic_val.json',
+ use_area=False)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md
new file mode 100644
index 0000000..4cff144
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.md
@@ -0,0 +1,40 @@
+
+
+
+AlexNet (NeurIPS'2012)
+
+```bibtex
+@inproceedings{krizhevsky2012imagenet,
+ title={Imagenet classification with deep convolutional neural networks},
+ author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
+ booktitle={Advances in neural information processing systems},
+ pages={1097--1105},
+ year={2012}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_alexnet](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py) | 256x192 | 0.448 | 0.767 | 0.461 | 0.521 | 0.829 | [ckpt](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192_20200727.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml
new file mode 100644
index 0000000..0451088
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/alexnet_coco.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - AlexNet
+ Training Data: COCO
+ Name: td-hm_alexnet_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.448
+ AP@0.5: 0.767
+ AP@0.75: 0.461
+ AR: 0.521
+ AR@0.5: 0.829
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md
new file mode 100644
index 0000000..c0ecaad
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md
@@ -0,0 +1,41 @@
+
+
+
+CPM (CVPR'2016)
+
+```bibtex
+@inproceedings{wei2016convolutional,
+ title={Convolutional pose machines},
+ author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
+ booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
+ pages={4724--4732},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py) | 256x192 | 0.627 | 0.862 | 0.709 | 0.689 | 0.906 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192-0e978875_20220920.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192_20220920.log) |
+| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py) | 384x288 | 0.652 | 0.865 | 0.730 | 0.710 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288-165487b8_20221011.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288_20221011.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml
new file mode 100644
index 0000000..aee822b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.yml
@@ -0,0 +1,40 @@
+Collections:
+- Name: CPM
+ Paper:
+ Title: Convolutional pose machines
+ URL: http://openaccess.thecvf.com/content_cvpr_2016/html/Wei_Convolutional_Pose_Machines_CVPR_2016_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/cpm.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py
+ In Collection: CPM
+ Metadata:
+ Architecture: &id001
+ - CPM
+ Training Data: COCO
+ Name: td-hm_cpm_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.627
+ AP@0.5: 0.862
+ AP@0.75: 0.709
+ AR: 0.689
+ AR@0.5: 0.906
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192-0e978875_20220920.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_cpm_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.652
+ AP@0.5: 0.865
+ AP@0.75: 0.730
+ AR: 0.710
+ AR@0.5: 0.907
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288-165487b8_20221011.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py
new file mode 100644
index 0000000..db92dac
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py
@@ -0,0 +1,284 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 105 to 210 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# keypoint mappings
+keypoint_mapping_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+keypoint_mapping_aic = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ (12, 17),
+ (13, 18),
+]
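+# Both mappings convert annotations into a shared 19-keypoint scheme:
+# indices 0-16 are the COCO keypoints, while the AIC-only head-top and neck
+# land on indices 17-18. KeypointConverter applies each (source, target)
+# index pair per dataset before the shared training pipeline runs.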
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=19,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ output_keypoint_indices=[
+ target for _, target in keypoint_mapping_coco
+ ]))
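+# Although the head predicts all 19 channels, output_keypoint_indices keeps
+# only the 17 COCO targets at inference, so evaluation stays compatible with
+# standard COCO keypoint metrics.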
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_coco)
+ ],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_aic)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
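+# PipelineSwitchHook swaps in train_pipeline_stage2 for the final
+# stage2_num_epochs (epochs 180-210), easing the augmentation (tighter scale
+# jitter, weaker CoarseDropout); EMAHook maintains an exponential moving
+# average of the weights that is used for validation.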
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py
new file mode 100644
index 0000000..a6f6239
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py
@@ -0,0 +1,214 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 105 to 210 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
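Every config in this family uses the `UDPHeatmap` codec: each keypoint becomes a Gaussian (sigma=2) on a 48x64 heatmap, stride 4 relative to the 192x256 input. UDP's "unbiased data processing" amounts to measuring coordinates in unit lengths of (size - 1) so the input and heatmap grids stay aligned under flips and resizes. A minimal sketch of the encoding under that (size - 1) assumption; the actual mmpose codec also covers decoding, refinement, and edge cases:

```python
import numpy as np

def encode_udp(kpt_xy, input_size=(192, 256), heatmap_size=(48, 64), sigma=2):
    """Render one keypoint as a Gaussian heatmap with UDP-style scaling."""
    W, H = input_size
    w, h = heatmap_size
    # Unbiased mapping: scale by (size - 1) ratios, not the naive size ratio.
    scale = np.array([(w - 1) / (W - 1), (h - 1) / (H - 1)])
    cx, cy = kpt_xy * scale
    xs = np.arange(w)[None, :]                   # (1, 48)
    ys = np.arange(h)[:, None]                   # (64, 1)
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))

heatmap = encode_udp(np.array([96.0, 128.0]))    # image-center keypoint
print(heatmap.shape)                             # (64, 48)
print(np.unravel_index(heatmap.argmax(), heatmap.shape))  # peak near (32, 24)
```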
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py
new file mode 100644
index 0000000..d51d467
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py
@@ -0,0 +1,284 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# keypoint mappings
+keypoint_mapping_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+keypoint_mapping_aic = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ (12, 17),
+ (13, 18),
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=19,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ output_keypoint_indices=[
+ target for _, target in keypoint_mapping_coco
+ ]))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_coco)
+ ],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_aic)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
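All of these configs share one schedule: a 1000-iteration linear warmup (`by_epoch=False`), a constant phase at `base_lr`, then cosine annealing from epoch 105 (`max_epochs // 2`) down to `eta_min = 0.05 * base_lr` at epoch 210. A back-of-the-envelope sketch of the resulting curve; the per-iteration detail of `convert_to_iter_based=True` is simplified, and `iters_per_epoch` is a made-up number:

```python
import math

base_lr = 4e-3
max_epochs = 210
eta_min = base_lr * 0.05

def lr_at(iteration, iters_per_epoch=500):
    epoch = iteration / iters_per_epoch
    if iteration < 1000:                         # LinearLR warmup
        factor = 1.0e-5 + (1 - 1.0e-5) * iteration / 1000
        return base_lr * factor
    if epoch < max_epochs // 2:                  # before the cosine phase begins
        return base_lr
    t = (epoch - max_epochs // 2) / (max_epochs // 2)   # 0 -> 1 over 105..210
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

for it in (0, 500, 1000, 105 * 500, 157 * 500, 210 * 500):
    print(f"iter {it:>6}: lr = {lr_at(it):.6f}")
```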
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py
new file mode 100644
index 0000000..a1dd5f6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py
@@ -0,0 +1,214 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
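`auto_scale_lr` declares the linear scaling rule: if the actual total batch size differs from `base_batch_size=1024`, the learning rate scales proportionally. In MMEngine this only takes effect when auto scaling is explicitly enabled at launch, so the sketch below is illustrative rather than default behavior:

```python
def scale_lr(base_lr, base_batch_size, num_gpus, samples_per_gpu):
    """Linear scaling rule: lr grows in proportion to the total batch size."""
    actual_batch_size = num_gpus * samples_per_gpu
    return base_lr * actual_batch_size / base_batch_size

# "8xb256" in the config names means 8 GPUs x 256 samples = 2048 in total,
# twice the declared base of 1024, so the effective lr would double:
print(scale_lr(4e-3, 1024, 8, 256))              # 0.008
```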
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py
new file mode 100644
index 0000000..323a9e8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py
@@ -0,0 +1,284 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# keypoint mappings
+keypoint_mapping_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+keypoint_mapping_aic = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ (12, 17),
+ (13, 18),
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-s_imagenet_600e-ea671761.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=512,
+ out_channels=19,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ output_keypoint_indices=[
+ target for _, target in keypoint_mapping_coco
+ ]))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_coco)
+ ],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_aic)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
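The `EMAHook` entries maintain an exponential moving average of the model weights, nudging the averaged copy a small step (`momentum=0.0002`) toward the live weights after every iteration; `ExpMomentumEMA` additionally ramps the effective momentum early in training. A simplified sketch with that warmup omitted:

```python
import numpy as np

def ema_update(ema_params, params, momentum=0.0002):
    """One EMA step: move averaged weights slightly toward the live weights."""
    for name in ema_params:
        ema_params[name] = (1 - momentum) * ema_params[name] + momentum * params[name]
    return ema_params

ema = {"w": np.zeros(3)}
live = {"w": np.ones(3)}
for _ in range(10_000):
    ema = ema_update(ema, live)
print(ema["w"])   # ~0.865 after 10k steps: 1 - (1 - 2e-4) ** 10000
```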
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py
new file mode 100644
index 0000000..918b2fa
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py
@@ -0,0 +1,214 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-s_imagenet_600e-ea671761.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=512,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
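`mmdet.PipelineSwitchHook` implements the two-stage augmentation recipe: for the last `stage2_num_epochs = 30` epochs (from epoch 180 on), the strong `train_pipeline` is swapped for the milder `train_pipeline_stage2` (no bbox shift, tighter scale and rotation ranges, CoarseDropout probability 0.5 instead of 1.0). Conceptually the hook is tiny; the class below is a hypothetical stand-in, not the mmdet source:

```python
class PipelineSwitch:
    """Swap the training pipeline once a given epoch is reached."""

    def __init__(self, switch_epoch, switch_pipeline):
        self.switch_epoch = switch_epoch         # 210 - 30 = 180 here
        self.switch_pipeline = switch_pipeline
        self._done = False

    def before_train_epoch(self, runner):
        if not self._done and runner.epoch >= self.switch_epoch:
            runner.train_dataloader.dataset.pipeline = self.switch_pipeline
            self._done = True
```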
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py
new file mode 100644
index 0000000..e25d29b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py
@@ -0,0 +1,284 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# keypoint mappings
+keypoint_mapping_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+keypoint_mapping_aic = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ (12, 17),
+ (13, 18),
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-tiny_imagenet_600e-3a2dd350.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=384,
+ out_channels=19,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ output_keypoint_indices=[
+ target for _, target in keypoint_mapping_coco
+ ]))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type='RepeatDataset',
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_coco)
+ ],
+ ),
+ times=3)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_aic)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
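In the aic-coco variants the head predicts 19 channels, but evaluation still targets COCO's 17 keypoints, so `test_cfg.output_keypoint_indices` (here just `[0, 1, ..., 16]`, derived from `keypoint_mapping_coco`) drops the extra head-top and neck channels from the reported predictions. The effect, sketched:

```python
import numpy as np

keypoint_mapping_coco = [(i, i) for i in range(17)]
output_keypoint_indices = [target for _, target in keypoint_mapping_coco]

pred = np.random.rand(19, 2)                     # one instance, 19 keypoints
coco_pred = pred[output_keypoint_indices]        # keep only the COCO 17
assert coco_pred.shape == (17, 2)
```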
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py
new file mode 100644
index 0000000..576c3be
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py
@@ -0,0 +1,214 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 105 to 210
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-tiny_imagenet_600e-3a2dd350.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=384,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ # bbox_file='data/coco/person_detection_results/'
+ # 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
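The COCO-only configs enable `flip_test=True` with `flip_mode='heatmap'`: at test time the model also runs on the horizontally flipped image, the flipped heatmaps are mirrored back with left/right keypoint channels swapped, and the two sets are averaged. A numpy sketch of the merge step, assuming the standard COCO left/right pairs (the real logic lives inside mmpose's top-down estimator):

```python
import numpy as np

# (left, right) COCO keypoint index pairs: eyes, ears, shoulders,
# elbows, wrists, hips, knees, ankles.
FLIP_PAIRS = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10),
              (11, 12), (13, 14), (15, 16)]

def flip_test_merge(heatmaps, heatmaps_flipped):
    """Average original heatmaps with un-flipped flipped-image heatmaps."""
    back = heatmaps_flipped[:, :, ::-1].copy()   # mirror along the x axis
    for left, right in FLIP_PAIRS:
        back[[left, right]] = back[[right, left]]  # swap L/R channels
    return 0.5 * (heatmaps + back)

merged = flip_test_merge(np.random.rand(17, 64, 48), np.random.rand(17, 64, 48))
print(merged.shape)                              # (17, 64, 48)
```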
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md
new file mode 100644
index 0000000..29fd080
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md
@@ -0,0 +1,69 @@
+RTMDet (ArXiv 2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+Results on COCO val2017, using a person detector with 56.4 human AP on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP@0.5 | AP@0.75 | AR | AR@0.5 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_cspnext_t_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.665 | 0.874 | 0.723 | 0.723 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.json) |
+| [pose_cspnext_s_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.697 | 0.886 | 0.776 | 0.753 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.json) |
+| [pose_cspnext_m_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.732 | 0.896 | 0.806 | 0.785 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.json) |
+| [pose_cspnext_l_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.750 | 0.904 | 0.822 | 0.800 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.json) |
+| [pose_cspnext_t_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.655 | 0.884 | 0.731 | 0.689 | 0.890 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.json) |
+| [pose_cspnext_s_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.700 | 0.905 | 0.783 | 0.733 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.json) |
+| [pose_cspnext_m_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.748 | 0.925 | 0.818 | 0.777 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.json) |
+| [pose_cspnext_l_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.772 | 0.936 | 0.839 | 0.799 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.json) |
+
+Note that UDP also adopts the unbiased encoding/decoding algorithm of [DARK](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/techniques.html#darkpose-cvpr-2020).
+
+Neither flip test nor a person detector is used in the aic-coco results above.
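For reference, loading one of the checkpoints from the table above with the mmpose 1.x API looks roughly like this; the config path assumes the repo layout shown in these diffs, and passing no bounding boxes makes `inference_topdown` run on the whole image:

```python
from mmpose.apis import inference_topdown, init_model

cfg = ('configs/body_2d_keypoint/topdown_heatmap/coco/'
       'cspnext-m_udp_8xb256-210e_coco-256x192.py')
ckpt = ('https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
        'cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth')

model = init_model(cfg, ckpt, device='cpu')
results = inference_topdown(model, 'person.jpg')  # bboxes=None -> full image
print(results[0].pred_instances.keypoints.shape)  # e.g. (1, 17, 2)
```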
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml
new file mode 100644
index 0000000..b1d9cd8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.yml
@@ -0,0 +1,139 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: &id001
+ - CSPNeXt
+ - UDP
+ Training Data: COCO
+ Name: cspnext-tiny_udp_8xb256-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.665
+ AP@0.5: 0.874
+ AP@0.75: 0.723
+ AR: 0.723
+ AR@0.5: 0.917
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: cspnext-s_udp_8xb256-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.697
+ AP@0.5: 0.886
+ AP@0.75: 0.776
+ AR: 0.753
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: cspnext-m_udp_8xb256-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.732
+ AP@0.5: 0.896
+ AP@0.75: 0.806
+ AR: 0.785
+ AR@0.5: 0.937
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: cspnext-l_udp_8xb256-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.750
+ AP@0.5: 0.904
+ AP@0.75: 0.822
+ AR: 0.8
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AIC
+ Name: cspnext-tiny_udp_8xb256-210e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.655
+ AP@0.5: 0.884
+ AP@0.75: 0.731
+ AR: 0.689
+ AR@0.5: 0.89
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AIC
+ Name: cspnext-s_udp_8xb256-210e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.7
+ AP@0.5: 0.905
+ AP@0.75: 0.783
+ AR: 0.733
+ AR@0.5: 0.918
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AIC
+ Name: cspnext-m_udp_8xb256-210e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.925
+ AP@0.75: 0.818
+ AR: 0.777
+ AR@0.5: 0.933
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AIC
+ Name: cspnext-l_udp_8xb256-210e_aic-coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.772
+ AP@0.5: 0.936
+ AP@0.75: 0.839
+ AR: 0.799
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth
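These .yml files follow the OpenMMLab model-index schema, so they are machine-readable. For example, picking the strongest COCO model out of this file takes a few lines of PyYAML (assuming the file is on disk under the name shown in the diff):

```python
import yaml

with open('cspnext_udp_coco.yml') as f:
    index = yaml.safe_load(f)

best = max(index['Models'], key=lambda m: m['Results'][0]['Metrics']['AP'])
print(best['Name'], best['Results'][0]['Metrics']['AP'])
# cspnext-l_udp_8xb256-210e_aic-coco-256x192 0.772
```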
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md
new file mode 100644
index 0000000..e66d13e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.md
@@ -0,0 +1,42 @@
+Hourglass (ECCV'2016)
+
+```bibtex
+@inproceedings{newell2016stacked,
+ title={Stacked hourglass networks for human pose estimation},
+ author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
+ booktitle={European conference on computer vision},
+ pages={483--499},
+ year={2016},
+ organization={Springer}
+}
+```
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+Results on COCO val2017, using a person detector with 56.4 human AP on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP@0.5 | AP@0.75 | AR | AR@0.5 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py) | 256x256 | 0.726 | 0.896 | 0.799 | 0.780 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256_20200709.log.json) |
+| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py) | 384x384 | 0.746 | 0.900 | 0.812 | 0.797 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml
new file mode 100644
index 0000000..23d2a9b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml
@@ -0,0 +1,40 @@
+Collections:
+- Name: Hourglass
+ Paper:
+ Title: Stacked hourglass networks for human pose estimation
+ URL: https://link.springer.com/chapter/10.1007/978-3-319-46484-8_29
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hourglass.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture: &id001
+ - Hourglass
+ Training Data: COCO
+ Name: td-hm_hourglass52_8xb32-210e_coco-256x256
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.726
+ AP@0.5: 0.896
+ AP@0.75: 0.799
+ AR: 0.780
+ AR@0.5: 0.934
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hourglass52_8xb32-210e_coco-384x384
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.746
+ AP@0.5: 0.900
+ AP@0.75: 0.812
+ AR: 0.797
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md
new file mode 100644
index 0000000..d49774d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.md
@@ -0,0 +1,43 @@
+HRFormer (NIPS'2021)
+
+```bibtex
+@article{yuan2021hrformer,
+  title={HRFormer: High-Resolution Transformer for Dense Prediction},
+ author={Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
+ journal={Advances in Neural Information Processing Systems},
+ volume={34},
+ year={2021}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py) | 256x192 | 0.738 | 0.904 | 0.812 | 0.793 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192_20220316.log.json) |
+| [pose_hrformer_small](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py) | 384x288 | 0.757 | 0.905 | 0.824 | 0.807 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288_20220316.log.json) |
+| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py) | 256x192 | 0.754 | 0.906 | 0.827 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) |
+| [pose_hrformer_base](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py) | 384x288 | 0.774 | 0.909 | 0.842 | 0.823 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288_20220316.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml
new file mode 100644
index 0000000..81e8d2b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrformer_coco.yml
@@ -0,0 +1,72 @@
+Collections:
+- Name: HRFormer
+ Paper:
+    Title: 'HRFormer: High-Resolution Transformer for Dense Prediction'
+ URL: https://proceedings.neurips.cc/paper/2021/hash/3bbfdde8842a5c44a0323518eec97cbe-Abstract.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrformer.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py
+ In Collection: HRFormer
+ Metadata:
+ Architecture: &id001
+ - HRFormer
+ Training Data: COCO
+ Name: td-hm_hrformer-small_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.738
+ AP@0.5: 0.904
+ AP@0.75: 0.812
+ AR: 0.793
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py
+ In Collection: HRFormer
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrformer-small_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.757
+ AP@0.5: 0.905
+ AP@0.75: 0.824
+ AR: 0.807
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py
+ In Collection: HRFormer
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrformer-base_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.906
+ AP@0.75: 0.827
+ AR: 0.807
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py
+ In Collection: HRFormer
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrformer-base_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.774
+ AP@0.5: 0.909
+ AP@0.75: 0.842
+ AR: 0.823
+ AR@0.5: 0.945
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md
new file mode 100644
index 0000000..010ecdb
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.md
@@ -0,0 +1,62 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+Albumentations (Information'2020)
+
+```bibtex
+@article{buslaev2020albumentations,
+ title={Albumentations: fast and flexible image augmentations},
+ author={Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A},
+ journal={Information},
+ volume={11},
+ number={2},
+ pages={125},
+ year={2020},
+ publisher={Multidisciplinary Digital Publishing Institute}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [coarsedropout](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py) | 256x192 | 0.753 | 0.908 | 0.822 | 0.805 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout-0f16a0ce_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout_20210320.log.json) |
+| [gridmask](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py) | 256x192 | 0.752 | 0.906 | 0.825 | 0.804 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask-868180df_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask_20210320.log.json) |
+| [photometric](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py) | 256x192 | 0.754 | 0.908 | 0.825 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric-308cf591_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric_20210320.log.json) |
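+
+The augmentations above are plugged into the training pipeline through MMPose's Albumentations wrapper. An illustrative fragment for the coarsedropout variant — the transform parameters here are placeholder values, not the ones from the linked config:
+
+```python
+# Illustrative train-pipeline fragment; hole counts and sizes are placeholders.
+train_pipeline = [
+    dict(type='LoadImage'),
+    dict(type='Albumentation', transforms=[
+        dict(type='CoarseDropout',
+             max_holes=8, max_height=40, max_width=40, p=1.0),
+    ]),
+    # ...remaining keypoint-aware transforms (affine, heatmap encoding) follow
+]
+```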
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml
new file mode 100644
index 0000000..b31ef80
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_augmentation_coco.yml
@@ -0,0 +1,56 @@
+Collections:
+- Name: Albumentations
+ Paper:
+ Title: 'Albumentations: fast and flexible image augmentations'
+ URL: https://www.mdpi.com/649002
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/albumentations.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py
+ In Collection: Albumentations
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.753
+ AP@0.5: 0.908
+ AP@0.75: 0.822
+ AR: 0.805
+ AR@0.5: 0.944
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_coarsedropout-0f16a0ce_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py
+ In Collection: Albumentations
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.752
+ AP@0.5: 0.906
+ AP@0.75: 0.825
+ AR: 0.804
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_gridmask-868180df_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py
+ In Collection: Albumentations
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.908
+ AP@0.75: 0.825
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/augmentation/hrnet_w32_coco_256x192_photometric-308cf591_20210320.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md
new file mode 100644
index 0000000..f8c09f3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md
@@ -0,0 +1,43 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py) | 384x288 | 0.761 | 0.908 | 0.826 | 0.811 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288_20220909.log) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.756 | 0.908 | 0.826 | 0.809 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py) | 384x288 | 0.767 | 0.911 | 0.832 | 0.817 | 0.947 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288_20220915.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml
new file mode 100644
index 0000000..525a496
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml
@@ -0,0 +1,124 @@
+Collections:
+- Name: HRNet
+ Paper:
+ Title: Deep high-resolution representation learning for human pose estimation
+ URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnet.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.746
+ AP@0.5: 0.904
+ AP@0.75: 0.819
+ AR: 0.799
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.76
+ AP@0.5: 0.906
+ AP@0.75: 0.83
+ AR: 0.81
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.756
+ AP@0.5: 0.907
+ AP@0.75: 0.825
+ AR: 0.806
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.767
+ AP@0.5: 0.91
+ AP@0.75: 0.831
+ AR: 0.816
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AI Challenger
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.757
+ AP@0.5: 0.907
+ AP@0.75: 0.829
+ AR: 0.809
+ AR@0.5: 0.944
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge-b05435b9_20221025.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data:
+ - COCO
+ - AI Challenger
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.756
+ AP@0.5: 0.906
+ AP@0.75: 0.826
+ AR: 0.807
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine-4ce66880_20221026.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.749
+ AP@0.5: 0.907
+ AP@0.75: 0.822
+ AR: 0.802
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192-f1e84e3b_20220914.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md
new file mode 100644
index 0000000..456acdd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md
@@ -0,0 +1,61 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+
+AI Challenger (ArXiv'2017)
+
+```bibtex
+@article{wu2017ai,
+ title={Ai challenger: A large-scale dataset for going deeper in image understanding},
+ author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
+ journal={arXiv preprint arXiv:1711.06475},
+ year={2017}
+}
+```
+
+
+
+MMPose supports training models with combined datasets. [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) and [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) are two examples.
+
+- [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) leverages AIC data, whose annotations cover only part of the COCO keypoint set, as auxiliary data to train a COCO model.
+- [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) constructs a combined dataset whose keypoint set is the union of the COCO and AIC keypoints, training a single model that predicts keypoints of both datasets (see the config sketch below).
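+
+For reference, the heart of the merge strategy is a `CombinedDataset` wrapping both sources, with a `KeypointConverter` remapping AIC keypoints onto COCO indices. A condensed sketch of the config fragment — data paths are placeholders and the keypoint mapping is truncated; see the linked configs for the full definitions:
+
+```python
+# Condensed sketch of the coco-aic-merge dataset config; paths are placeholders.
+train_pipeline = []  # shared augmentation / target-encoding transforms go here
+dataset_coco = dict(
+    type='CocoDataset',
+    data_root='data/coco/',
+    ann_file='annotations/person_keypoints_train2017.json',
+    pipeline=[],  # COCO keypoints already match the target layout
+)
+dataset_aic = dict(
+    type='AicDataset',
+    data_root='data/aic/',
+    ann_file='annotations/aic_train.json',
+    # Remap AIC keypoint indices onto the 17-keypoint COCO layout
+    # (mapping truncated; the real config lists every shared keypoint).
+    pipeline=[dict(type='KeypointConverter', num_keypoints=17,
+                   mapping=[(0, 6), (1, 8), (2, 10), (3, 5), (4, 7)])],
+)
+train_dataset = dict(
+    type='CombinedDataset',
+    metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+    datasets=[dataset_coco, dataset_aic],
+    pipeline=train_pipeline,
+)
+```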
+
+Evaluation results on COCO val2017 for models trained on COCO alone and on the combined datasets are shown below. All models are evaluated with a detector having human AP of 56.4 on the COCO val2017 dataset.
+
+| Train Set | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :------------------------------------------- | :------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------: | :------------------------------------: |
+| [coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | pose_hrnet_w32 | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) |
+| [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) | pose_hrnet_w32 | 256x192 | 0.756 | 0.907 | 0.828 | 0.809 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge-a9ea6d77_20230818.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge_20230818.json) |
+| [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) | pose_hrnet_w32 | 256x192 | 0.755 | 0.904 | 0.825 | 0.807 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine-458125cc_20230818.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine_20230818.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md
new file mode 100644
index 0000000..89fa371
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.md
@@ -0,0 +1,60 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.907 | 0.825 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192-0e00bf12_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192_20220914.log) |
+| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.766 | 0.907 | 0.829 | 0.815 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288-9bab4c9b_20220917.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288_20220917.log) |
+| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.764 | 0.907 | 0.831 | 0.814 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192-e1ebdd6f_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.772 | 0.911 | 0.833 | 0.821 | 0.948 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288-39c3c381_20220916.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288_20220916.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml
new file mode 100644
index 0000000..ae3d2df
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_dark_coco.yml
@@ -0,0 +1,73 @@
+Collections:
+- Name: DarkPose
+ Paper:
+ Title: Distribution-aware coordinate representation for human pose estimation
+ URL: http://openaccess.thecvf.com/content_CVPR_2020/html/Zhang_Distribution-Aware_Coordinate_Representation_for_Human_Pose_Estimation_CVPR_2020_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/dark.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ - DarkPose
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.757
+ AP@0.5: 0.907
+ AP@0.75: 0.825
+ AR: 0.807
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192-0e00bf12_20220914.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.766
+ AP@0.5: 0.907
+ AP@0.75: 0.829
+ AR: 0.815
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288-9bab4c9b_20220917.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.764
+ AP@0.5: 0.907
+ AP@0.75: 0.831
+ AR: 0.814
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192-e1ebdd6f_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.772
+ AP@0.5: 0.911
+ AP@0.75: 0.833
+ AR: 0.821
+ AR@0.5: 0.948
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288-39c3c381_20220916.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md
new file mode 100644
index 0000000..79aa611
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_fp16_coco.md
@@ -0,0 +1,56 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+FP16 (ArXiv'2017)
+
+```bibtex
+@article{micikevicius2017mixed,
+ title={Mixed precision training},
+ author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
+ journal={arXiv preprint arXiv:1710.03740},
+ year={2017}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.907 | 0.822 | 0.802 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192-f1e84e3b_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192_20220914.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md
new file mode 100644
index 0000000..988df0f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md
@@ -0,0 +1,63 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py) | 256x192 | 0.762 | 0.907 | 0.829 | 0.810 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192-73ede547_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192_20220914.log) |
+| [pose_hrnet_w32_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py) | 384x288 | 0.768 | 0.909 | 0.832 | 0.815 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288-9a3f7c85_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288_20220914.log) |
+| [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py) | 256x192 | 0.768 | 0.908 | 0.833 | 0.817 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192-3feaef8f_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py) | 384x288 | 0.773 | 0.911 | 0.836 | 0.821 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288-70d7ab01_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288_20220913.log) |
+| [pose_hrnet_w32_udp_regress](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py) | 256x192 | 0.759 | 0.907 | 0.827 | 0.813 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192_20220226.log) |
+
+Note that UDP also adopts the unbiased encoding/decoding algorithm of [DARK](https://mmpose.readthedocs.io/en/latest/model_zoo_papers/techniques.html#darkpose-cvpr-2020).
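+
+For intuition, DARK-style decoding replaces the integer heatmap argmax with a Newton step on the log-heatmap. A rough NumPy sketch of the idea only — not mmpose's actual implementation, which additionally Gaussian-blurs the heatmap before refinement:
+
+```python
+import numpy as np
+
+def dark_refine(heatmap: np.ndarray, x: int, y: int) -> np.ndarray:
+    """One-step Taylor refinement of an interior integer peak (x, y)."""
+    h = np.log(np.maximum(heatmap, 1e-10))
+    # Gradient and Hessian of the log-heatmap via central differences
+    dx = 0.5 * (h[y, x + 1] - h[y, x - 1])
+    dy = 0.5 * (h[y + 1, x] - h[y - 1, x])
+    dxx = h[y, x + 1] - 2 * h[y, x] + h[y, x - 1]
+    dyy = h[y + 1, x] - 2 * h[y, x] + h[y - 1, x]
+    dxy = 0.25 * (h[y + 1, x + 1] - h[y + 1, x - 1]
+                  - h[y - 1, x + 1] + h[y - 1, x - 1])
+    hess = np.array([[dxx, dxy], [dxy, dyy]])
+    grad = np.array([dx, dy])
+    if abs(np.linalg.det(hess)) > 1e-12:
+        # Newton step toward the continuous mode of the Gaussian peak
+        return np.array([x, y], dtype=float) - np.linalg.solve(hess, grad)
+    return np.array([x, y], dtype=float)
+```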
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml
new file mode 100644
index 0000000..3971f52
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.yml
@@ -0,0 +1,90 @@
+Collections:
+- Name: UDP
+ Paper:
+ Title: 'The Devil Is in the Details: Delving Into Unbiased Data Processing for
+ Human Pose Estimation'
+ URL: http://openaccess.thecvf.com/content_CVPR_2020/html/Huang_The_Devil_Is_in_the_Details_Delving_Into_Unbiased_Data_CVPR_2020_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/udp.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ - UDP
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.762
+ AP@0.5: 0.907
+ AP@0.75: 0.829
+ AR: 0.810
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192-73ede547_20220914.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.768
+ AP@0.5: 0.909
+ AP@0.75: 0.832
+ AR: 0.815
+ AR@0.5: 0.945
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288-9a3f7c85_20220914.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.768
+ AP@0.5: 0.908
+ AP@0.75: 0.833
+ AR: 0.817
+ AR@0.5: 0.945
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192-3feaef8f_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.773
+ AP@0.5: 0.911
+ AP@0.75: 0.836
+ AR: 0.821
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288-70d7ab01_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.759
+ AP@0.5: 0.907
+ AP@0.75: 0.827
+ AR: 0.813
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md
new file mode 100644
index 0000000..2bdb62d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.md
@@ -0,0 +1,42 @@
+
+
+
+LiteHRNet (CVPR'2021)
+
+```bibtex
+@inproceedings{Yulitehrnet21,
+ title={Lite-HRNet: A Lightweight High-Resolution Network},
+ author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
+ booktitle={CVPR},
+ year={2021}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py) | 256x192 | 0.642 | 0.867 | 0.719 | 0.705 | 0.911 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192-6bace359_20211230.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192_20211230.log.json) |
+| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py) | 384x288 | 0.676 | 0.876 | 0.746 | 0.735 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288-8d4dac48_20211230.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288_20211230.log.json) |
+| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py) | 256x192 | 0.676 | 0.880 | 0.756 | 0.736 | 0.922 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192-4176555b_20210626.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192_20210626.log.json) |
+| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py) | 384x288 | 0.700 | 0.883 | 0.776 | 0.758 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288-a3aef5c4_20210626.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288_20210626.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml
new file mode 100644
index 0000000..11ecf92
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml
@@ -0,0 +1,72 @@
+Collections:
+- Name: LiteHRNet
+ Paper:
+ Title: 'Lite-HRNet: A Lightweight High-Resolution Network'
+ URL: https://arxiv.org/abs/2104.06403
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/litehrnet.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: &id001
+ - LiteHRNet
+ Training Data: COCO
+ Name: td-hm_litehrnet-18_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.642
+ AP@0.5: 0.867
+ AP@0.75: 0.719
+ AR: 0.705
+ AR@0.5: 0.911
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_256x192-6bace359_20211230.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_litehrnet-18_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.676
+ AP@0.5: 0.876
+ AP@0.75: 0.746
+ AR: 0.735
+ AR@0.5: 0.919
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_coco_384x288-8d4dac48_20211230.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_litehrnet-30_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.676
+ AP@0.5: 0.88
+ AP@0.75: 0.756
+ AR: 0.736
+ AR@0.5: 0.922
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_256x192-4176555b_20210626.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_litehrnet-30_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.7
+ AP@0.5: 0.883
+ AP@0.75: 0.776
+ AR: 0.758
+ AR@0.5: 0.926
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_coco_384x288-a3aef5c4_20210626.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md
new file mode 100644
index 0000000..7df4a42
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md
@@ -0,0 +1,41 @@
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.648 | 0.874 | 0.725 | 0.709 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192-55a04c35_20221016.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192_20221016.log) |
+| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.677 | 0.882 | 0.746 | 0.734 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288-d3ab1457_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288_20221013.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml
new file mode 100644
index 0000000..644a6b6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.yml
@@ -0,0 +1,35 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: COCO
+ Name: td-hm_mobilenetv2_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.648
+ AP@0.5: 0.874
+ AP@0.75: 0.725
+ AR: 0.709
+ AR@0.5: 0.918
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192-55a04c35_20221016.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_mobilenetv2_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.677
+ AP@0.5: 0.882
+ AP@0.75: 0.746
+ AR: 0.734
+ AR@0.5: 0.920
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288-d3ab1457_20221013.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md
new file mode 100644
index 0000000..a67cd63
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.md
@@ -0,0 +1,42 @@
+
+
+
+MSPN (ArXiv'2019)
+
+```bibtex
+@article{li2019rethinking,
+ title={Rethinking on Multi-Stage Networks for Human Pose Estimation},
+ author={Li, Wenbo and Wang, Zhicheng and Yin, Binyi and Peng, Qixiang and Du, Yuming and Xiao, Tianzi and Yu, Gang and Lu, Hongtao and Wei, Yichen and Sun, Jian},
+ journal={arXiv preprint arXiv:1901.00148},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [mspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.723 | 0.895 | 0.794 | 0.788 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192-8fbfb5d0_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192_20201123.log.json) |
+| [2xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.754 | 0.903 | 0.826 | 0.816 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192-c8765a5c_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192_20201123.log.json) |
+| [3xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.758 | 0.904 | 0.830 | 0.821 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192-e348f18e_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192_20201123.log.json) |
+| [4xmspn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.765 | 0.906 | 0.835 | 0.826 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192-7b837afb_20201123.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192_20201123.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml
new file mode 100644
index 0000000..1165bbc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml
@@ -0,0 +1,72 @@
+Collections:
+- Name: MSPN
+ Paper:
+ Title: Rethinking on Multi-Stage Networks for Human Pose Estimation
+ URL: https://arxiv.org/abs/1901.00148
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/mspn.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py
+ In Collection: MSPN
+ Metadata:
+ Architecture: &id001
+ - MSPN
+ Training Data: COCO
+ Name: td-hm_mspn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.723
+ AP@0.5: 0.895
+ AP@0.75: 0.794
+ AR: 0.788
+ AR@0.5: 0.934
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/mspn/mspn50_coco_256x192-8fbfb5d0_20201123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py
+ In Collection: MSPN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_2xmspn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.903
+ AP@0.75: 0.826
+ AR: 0.816
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/mspn/2xmspn50_coco_256x192-c8765a5c_20201123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py
+ In Collection: MSPN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_3xmspn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.758
+ AP@0.5: 0.904
+ AP@0.75: 0.83
+ AR: 0.821
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/mspn/3xmspn50_coco_256x192-e348f18e_20201123.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py
+ In Collection: MSPN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_4xmspn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.765
+ AP@0.5: 0.906
+ AP@0.75: 0.835
+ AR: 0.826
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/mspn/4xmspn50_coco_256x192-7b837afb_20201123.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md
new file mode 100644
index 0000000..74a189d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.md
@@ -0,0 +1,57 @@
+
+
+
+PVT (ICCV'2021)
+
+```bibtex
+@inproceedings{wang2021pyramid,
+ title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions},
+ author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={568--578},
+ year={2021}
+}
+```
+
+
+
+
+PVTV2 (CVMJ'2022)
+
+```bibtex
+@article{wang2022pvt,
+ title={PVT v2: Improved baselines with Pyramid Vision Transformer},
+ author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
+ journal={Computational Visual Media},
+ pages={1--10},
+ year={2022},
+ publisher={Springer}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_pvt-s](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py) | 256x192 | 0.714 | 0.896 | 0.794 | 0.773 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192-4324a49d_20220501.pth) | [log](https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192_20220501.log.json) |
+| [pose_pvtv2-b2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py) | 256x192 | 0.737 | 0.905 | 0.812 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192-b4212737_20220501.pth) | [log](https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192_20220501.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml
new file mode 100644
index 0000000..202ec81
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/pvt_coco.yml
@@ -0,0 +1,35 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - PVT
+ Training Data: COCO
+ Name: td-hm_pvt-s_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.714
+ AP@0.5: 0.896
+ AP@0.75: 0.794
+ AR: 0.773
+ AR@0.5: 0.936
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/pvt/pvt_small_coco_256x192-4324a49d_20220501.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_pvtv2-b2_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.905
+ AP@0.75: 0.812
+ AR: 0.791
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/pvt/pvtv2_b2_coco_256x192-b4212737_20220501.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md
new file mode 100644
index 0000000..8bee32c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.md
@@ -0,0 +1,46 @@
+
+
+
+ResNeSt (ArXiv'2020)
+
+```bibtex
+@article{zhang2020resnest,
+ title={ResNeSt: Split-Attention Networks},
+ author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander},
+ journal={arXiv preprint arXiv:2004.08955},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnest_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py) | 256x192 | 0.720 | 0.899 | 0.800 | 0.775 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192-6e65eece_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192_20210320.log.json) |
+| [pose_resnest_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py) | 384x288 | 0.737 | 0.900 | 0.811 | 0.789 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288-dcd20436_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288_20210320.log.json) |
+| [pose_resnest_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py) | 256x192 | 0.725 | 0.900 | 0.807 | 0.781 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192-2ffcdc9d_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192_20210320.log.json) |
+| [pose_resnest_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py) | 384x288 | 0.745 | 0.905 | 0.818 | 0.798 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288-80660658_20210320.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288_20210320.log.json) |
+| [pose_resnest_200](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py) | 256x192 | 0.731 | 0.905 | 0.812 | 0.787 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192-db007a48_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192_20210517.log.json) |
+| [pose_resnest_200](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py) | 384x288 | 0.753 | 0.907 | 0.827 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288-b5bb76cb_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288_20210517.log.json) |
+| [pose_resnest_269](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.907 | 0.819 | 0.792 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192-2a7882ac_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192_20210517.log.json) |
+| [pose_resnest_269](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py) | 384x288 | 0.754 | 0.908 | 0.828 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288-b142b9fb_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288_20210517.log.json) |
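
For readers skimming these tables: AP, AP@0.5/AP@0.75, and AR are the standard COCO keypoint metrics, computed from Object Keypoint Similarity (OKS), which plays the role that IoU plays in box detection. Restating the benchmark's definition for reference:

```latex
% OKS between a predicted and an annotated pose (COCO keypoint benchmark):
%   d_i : Euclidean distance between predicted and annotated keypoint i
%   s   : object scale (square root of the annotated segment area)
%   k_i : per-keypoint falloff constant released with the benchmark
%   v_i : annotated visibility flag of keypoint i
\mathrm{OKS} = \frac{\sum_i \exp\!\bigl(-d_i^2 / 2 s^2 k_i^2\bigr)\,\delta(v_i > 0)}{\sum_i \delta(v_i > 0)}
% AP@0.5 and AP@0.75 threshold OKS at 0.5 and 0.75; the bare AP column
% averages AP over OKS thresholds 0.50:0.05:0.95, and AR is the matching recall.
```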
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml
new file mode 100644
index 0000000..d039829
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnest_coco.yml
@@ -0,0 +1,131 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNeSt
+ Training Data: COCO
+ Name: td-hm_resnest50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.720
+ AP@0.5: 0.899
+ AP@0.75: 0.8
+ AR: 0.775
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_256x192-6e65eece_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest50_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.9
+ AP@0.75: 0.811
+ AR: 0.789
+ AR@0.5: 0.937
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest50_coco_384x288-dcd20436_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.725
+ AP@0.5: 0.9
+ AP@0.75: 0.807
+ AR: 0.781
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_256x192-2ffcdc9d_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest101_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.745
+ AP@0.5: 0.905
+ AP@0.75: 0.818
+ AR: 0.798
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest101_coco_384x288-80660658_20210320.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest200_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.731
+ AP@0.5: 0.905
+ AP@0.75: 0.812
+ AR: 0.787
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_256x192-db007a48_20210517.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest200_8xb16-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.753
+ AP@0.5: 0.907
+ AP@0.75: 0.827
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest200_coco_384x288-b5bb76cb_20210517.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest269_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.907
+ AP@0.75: 0.819
+ AR: 0.792
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_256x192-2a7882ac_20210517.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnest269_8xb16-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.908
+ AP@0.75: 0.828
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnest/resnest269_coco_384x288-b142b9fb_20210517.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md
new file mode 100644
index 0000000..83b7a90
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md
@@ -0,0 +1,68 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.718 | 0.898 | 0.796 | 0.774 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py) | 384x288 | 0.731 | 0.900 | 0.799 | 0.782 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288-7b8db90e_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288_20220923.log) |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.728 | 0.904 | 0.809 | 0.783 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py) | 384x288 | 0.749 | 0.906 | 0.817 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) |
+| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py) | 256x192 | 0.736 | 0.904 | 0.818 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192_20220928.log) |
+| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py) | 384x288 | 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288_20220927.log) |
+
+The following model is equipped with a visibility prediction head and was trained on the COCO and AIC datasets.
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py) | 256x192 | 0.729 | 0.900 | 0.807 | 0.783 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge-21815b2c_20230726.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) |
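
The caption "with a detector having a human AP of 56.4" reflects how these top-down models are evaluated: person boxes come from a separate detector first, and each box is cropped and passed to the pose model. A sketch of that two-stage flow with the MMDetection/MMPose APIs; the detector config and checkpoint names here are placeholders, not files added by this diff:

```python
# Two-stage top-down inference sketch. DET_CONFIG.py / DET_CKPT.pth stand in
# for any MMDetection person detector; the pose config/checkpoint pair is the
# pose_resnet_50 entry from the table above.
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import inference_topdown, init_model
from mmpose.utils import adapt_mmdet_pipeline

detector = init_detector('DET_CONFIG.py', 'DET_CKPT.pth', device='cpu')
detector.cfg = adapt_mmdet_pipeline(detector.cfg)  # align mmdet's test pipeline with mmpose

pose_model = init_model(
    'configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py',
    'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth',
    device='cpu')

det_result = inference_detector(detector, 'demo.jpg').pred_instances.cpu().numpy()
# keep confident person boxes (COCO class 0), then pose-estimate each of them
person_boxes = det_result.bboxes[(det_result.labels == 0) & (det_result.scores > 0.3)]
pose_samples = inference_topdown(pose_model, 'demo.jpg', bboxes=person_boxes)
```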
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml
new file mode 100644
index 0000000..ad6dce9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml
@@ -0,0 +1,121 @@
+Collections:
+- Name: SimpleBaseline2D
+ Paper:
+ Title: Simple baselines for human pose estimation and tracking
+ URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/simplebaseline2d.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: COCO
+ Name: td-hm_res50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.718
+ AP@0.5: 0.898
+ AP@0.75: 0.796
+ AR: 0.774
+ AR@0.5: 0.934
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res50_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.731
+ AP@0.5: 0.9
+ AP@0.75: 0.799
+ AR: 0.782
+ AR@0.5: 0.937
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288-7b8db90e_20220923.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.728
+ AP@0.5: 0.904
+ AP@0.75: 0.809
+ AR: 0.783
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res101_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.749
+ AP@0.5: 0.906
+ AP@0.75: 0.817
+ AR: 0.799
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res152_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.736
+ AP@0.5: 0.904
+ AP@0.75: 0.818
+ AR: 0.791
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res152_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.75
+ AP@0.5: 0.908
+ AP@0.75: 0.821
+ AR: 0.8
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res50_fp16-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.716
+ AP@0.5: 0.898
+ AP@0.75: 0.798
+ AR: 0.772
+ AR@0.5: 0.937
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192-463da051_20220927.pth
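
These .yml files are machine-readable model indexes. The `&id001` / `*id001` markers are plain YAML anchors and aliases, so the shared Architecture list is written once and reused; any YAML parser resolves them transparently. A small sketch, assuming PyYAML is installed:

```python
# Print every model in the index added above, with its COCO AP and
# architecture list (the *id001 aliases resolve automatically on load).
import yaml

path = 'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.yml'
with open(path) as f:
    index = yaml.safe_load(f)

for model in index['Models']:
    metrics = model['Results'][0]['Metrics']
    arch = '+'.join(model['Metadata']['Architecture'])
    print(f"{model['Name']:<45} AP={metrics['AP']:.3f}  ({arch})")
```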
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md
new file mode 100644
index 0000000..9156fb4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md
@@ -0,0 +1,79 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.724 | 0.897 | 0.797 | 0.777 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192-c129dcb6_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192_20220926.log) |
+| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.735 | 0.902 | 0.801 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288-8b90b538_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288_20220926.log) |
+| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.733 | 0.900 | 0.810 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192-528ec248_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192_20220926.log) |
+| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.905 | 0.818 | 0.799 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288-487d40a4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288_20220926.log) |
+| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.743 | 0.906 | 0.819 | 0.796 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192-f754df5f_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192_20221031.log) |
+| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.755 | 0.907 | 0.825 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288-329f8454_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288_20221031.log) |
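
DarkPose, cited above, leaves the network untouched and instead changes how coordinates are decoded from the output heatmap. Sketching the paper's key step: assuming the predicted heatmap is roughly Gaussian, the integer-pixel argmax m is refined with a second-order Taylor expansion of the log-heatmap P:

```latex
% m                    : integer-pixel argmax of the predicted heatmap
% P                    : logarithm of the (smoothed) heatmap
% \nabla P, \nabla^2 P : gradient and Hessian of P evaluated at m
\hat{\mu} = m - \bigl(\nabla^2 P(m)\bigr)^{-1} \nabla P(m)
% giving a sub-pixel estimate in place of the conventional fixed
% quarter-pixel shift toward the second-highest response.
```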
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml
new file mode 100644
index 0000000..c5e156f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.yml
@@ -0,0 +1,100 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ - DarkPose
+ Training Data: COCO
+ Name: td-hm_res50_dark-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.724
+ AP@0.5: 0.897
+ AP@0.75: 0.797
+ AR: 0.777
+ AR@0.5: 0.934
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192-c129dcb6_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res50_dark-8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.735
+ AP@0.5: 0.902
+ AP@0.75: 0.801
+ AR: 0.786
+ AR@0.5: 0.938
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288-8b90b538_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res101_dark-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.733
+ AP@0.5: 0.9
+ AP@0.75: 0.81
+ AR: 0.786
+ AR@0.5: 0.938
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192-528ec248_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res101_dark-8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.749
+ AP@0.5: 0.905
+ AP@0.75: 0.818
+ AR: 0.799
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288-487d40a4_20220926.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res152_dark-8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.743
+ AP@0.5: 0.906
+ AP@0.75: 0.819
+ AR: 0.796
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192-f754df5f_20221031.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_res152_dark-8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.757
+ AP@0.5: 0.907
+ AP@0.75: 0.825
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288-329f8454_20221031.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md
new file mode 100644
index 0000000..8785e3b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md
@@ -0,0 +1,73 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+FP16 (ArXiv'2017)
+
+```bibtex
+@article{micikevicius2017mixed,
+ title={Mixed precision training},
+ author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh, Ganesh and others},
+ journal={arXiv preprint arXiv:1710.03740},
+ year={2017}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.716 | 0.898 | 0.798 | 0.772 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192-463da051_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192_20220927.log) |
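
The fp16 variant above is the same SimpleBaseline model trained with the mixed-precision recipe of the FP16 citation; the accuracy cost is negligible (0.716 vs 0.718 AP) while memory use and throughput improve. MMPose wires this up through its config system; purely as a framework-level illustration, the core pattern in plain PyTorch looks roughly like this:

```python
# Generic mixed-precision training loop (illustration only; mmpose handles
# this via its configs rather than hand-written loops like this one).
import torch

model = torch.nn.Conv2d(3, 17, 3).cuda()            # stand-in for a pose head
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()                # rescales loss to avoid fp16 underflow

for _ in range(10):
    x = torch.randn(8, 3, 64, 48, device='cuda')
    target = torch.randn(8, 17, 62, 46, device='cuda')
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():                 # forward ops run in a fp16/fp32 mix
        loss = torch.nn.functional.mse_loss(model(x), target)
    scaler.scale(loss).backward()                   # backward on the scaled loss
    scaler.step(optimizer)                          # unscales grads, then steps
    scaler.update()
```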
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md
new file mode 100644
index 0000000..59ac34f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md
@@ -0,0 +1,45 @@
+
+
+
+ResNetV1D (CVPR'2019)
+
+```bibtex
+@inproceedings{he2019bag,
+ title={Bag of tricks for image classification with convolutional neural networks},
+ author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={558--567},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.897 | 0.796 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192-27545d63_20221020.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192_20221020.log) |
+| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py) | 384x288 | 0.730 | 0.899 | 0.800 | 0.782 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288-0646b46e_20221020.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288_20221020.log) |
+| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py) | 256x192 | 0.732 | 0.901 | 0.808 | 0.785 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192-ee9e7212_20221021.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192_20221021.log) |
+| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py) | 384x288 | 0.748 | 0.906 | 0.817 | 0.798 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288-d0b5875f_20221028.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288_20221028.log) |
+| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.904 | 0.814 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192-fd49f947_20221021.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192_20221021.log) |
+| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py) | 384x288 | 0.751 | 0.907 | 0.821 | 0.801 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288-b9a99602_20221022.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288_20221022.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml
new file mode 100644
index 0000000..4acdfe4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.yml
@@ -0,0 +1,99 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNetV1D
+ Training Data: COCO
+ Name: td-hm_resnetv1d50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.722
+ AP@0.5: 0.897
+ AP@0.75: 0.796
+ AR: 0.777
+ AR@0.5: 0.936
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192-27545d63_20221020.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnetv1d50_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.73
+ AP@0.5: 0.899
+ AP@0.75: 0.8
+ AR: 0.782
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288-0646b46e_20221020.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnetv1d101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.732
+ AP@0.5: 0.901
+ AP@0.75: 0.808
+ AR: 0.785
+ AR@0.5: 0.940
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192-ee9e7212_20221021.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnetv1d101_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.906
+ AP@0.75: 0.817
+ AR: 0.798
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288-d0b5875f_20221028.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnetv1d152_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.904
+ AP@0.75: 0.814
+ AR: 0.790
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192-fd49f947_20221021.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnetv1d152_8xb48-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.751
+ AP@0.5: 0.907
+ AP@0.75: 0.821
+ AR: 0.801
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288-b9a99602_20221022.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md
new file mode 100644
index 0000000..ca7c1b5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.md
@@ -0,0 +1,45 @@
+
+
+
+ResNeXt (CVPR'2017)
+
+```bibtex
+@inproceedings{xie2017aggregated,
+ title={Aggregated residual transformations for deep neural networks},
+ author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1492--1500},
+ year={2017}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_resnext_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py) | 256x192 | 0.715 | 0.897 | 0.791 | 0.771 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192-dcff15f6_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192_20200727.log.json) |
+| [pose_resnext_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py) | 384x288 | 0.724 | 0.899 | 0.794 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288-412c848f_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288_20200727.log.json) |
+| [pose_resnext_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py) | 256x192 | 0.726 | 0.900 | 0.801 | 0.781 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192-c7eba365_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192_20200727.log.json) |
+| [pose_resnext_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py) | 384x288 | 0.744 | 0.903 | 0.815 | 0.794 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288-f5eabcd6_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288_20200727.log.json) |
+| [pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py) | 256x192 | 0.730 | 0.903 | 0.808 | 0.785 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192-102449aa_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192_20200727.log.json) |
+| [pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py) | 384x288 | 0.742 | 0.904 | 0.810 | 0.794 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288-806176df_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288_20200727.log.json) |
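
A note on the config names repeated throughout these files: following the OpenMMLab convention, `td-hm_resnext101_8xb32-210e_coco-384x288` reads as top-down heatmap, ResNeXt-101 backbone, 8 GPUs x batch 32 each, 210 training epochs, COCO data, 384x288 input. A hypothetical helper that unpacks the common pattern:

```python
# Hypothetical parser for the config naming scheme used in these files:
# td-hm_<backbone>[-_]<gpus>xb<batch>-<epochs>e_<dataset>-<HxW>
import re

PATTERN = re.compile(
    r'td-hm_(?P<backbone>.+?)[-_](?P<gpus>\d+)xb(?P<batch>\d+)'
    r'-(?P<epochs>\d+)e_(?P<dataset>\w+)-(?P<height>\d+)x(?P<width>\d+)')

name = 'td-hm_resnext101_8xb32-210e_coco-384x288'
print(PATTERN.match(name).groupdict())
# {'backbone': 'resnext101', 'gpus': '8', 'batch': '32',
#  'epochs': '210', 'dataset': 'coco', 'height': '384', 'width': '288'}
```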
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml
new file mode 100644
index 0000000..29b02d2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/resnext_coco.yml
@@ -0,0 +1,99 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNeXt
+ Training Data: COCO
+ Name: td-hm_resnext50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.715
+ AP@0.5: 0.897
+ AP@0.75: 0.791
+ AR: 0.771
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_256x192-dcff15f6_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnext50_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.724
+ AP@0.5: 0.899
+ AP@0.75: 0.794
+ AR: 0.777
+ AR@0.5: 0.936
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext50_coco_384x288-412c848f_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnext101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.726
+ AP@0.5: 0.9
+ AP@0.75: 0.801
+ AR: 0.781
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_256x192-c7eba365_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnext101_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.744
+ AP@0.5: 0.903
+ AP@0.75: 0.815
+ AR: 0.794
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext101_coco_384x288-f5eabcd6_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnext152_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.73
+ AP@0.5: 0.903
+ AP@0.75: 0.808
+ AR: 0.785
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_256x192-102449aa_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_resnext152_8xb48-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.742
+ AP@0.5: 0.904
+ AP@0.75: 0.81
+ AR: 0.794
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_coco_384x288-806176df_20200727.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md
new file mode 100644
index 0000000..b5470d1
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md
@@ -0,0 +1,44 @@
+
+
+
+RSN (ECCV'2020)
+
+```bibtex
+@misc{cai2020learning,
+ title={Learning Delicate Local Representations for Multi-Person Pose Estimation},
+ author={Yuanhao Cai and Zhicheng Wang and Zhengxiong Luo and Binyi Yin and Angang Du and Haoqian Wang and Xinyu Zhou and Erjin Zhou and Xiangyu Zhang and Jian Sun},
+ year={2020},
+ eprint={2003.04030},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [rsn_18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py) | 256x192 | 0.704 | 0.887 | 0.781 | 0.773 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192-9049ed09_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192_20221013.log) |
+| [rsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.724 | 0.894 | 0.799 | 0.790 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192-c35901d5_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192_20221013.log) |
+| [2xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.748 | 0.900 | 0.821 | 0.810 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192-9ede341e_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192_20221013.log) |
+| [3xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.750 | 0.900 | 0.824 | 0.814 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192-c3e3c4fe_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192_20221013.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml
new file mode 100644
index 0000000..9ef71e1
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.yml
@@ -0,0 +1,72 @@
+Collections:
+- Name: RSN
+ Paper:
+ Title: Learning Delicate Local Representations for Multi-Person Pose Estimation
+ URL: https://link.springer.com/chapter/10.1007/978-3-030-58580-8_27
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/rsn.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py
+ In Collection: RSN
+ Metadata:
+ Architecture: &id001
+ - RSN
+ Training Data: COCO
+ Name: td-hm_rsn18_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.704
+ AP@0.5: 0.887
+ AP@0.75: 0.781
+ AR: 0.773
+ AR@0.5: 0.927
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192-9049ed09_20221013.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py
+ In Collection: RSN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_rsn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.724
+ AP@0.5: 0.894
+ AP@0.75: 0.799
+ AR: 0.79
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192-c35901d5_20221013.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py
+ In Collection: RSN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_2xrsn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.9
+ AP@0.75: 0.821
+ AR: 0.81
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192-9ede341e_20221013.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py
+ In Collection: RSN
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_3xrsn50_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.75
+ AP@0.5: 0.9
+ AP@0.75: 0.824
+ AR: 0.814
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192-c3e3c4fe_20221013.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md
new file mode 100644
index 0000000..c02ef7d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.md
@@ -0,0 +1,43 @@
+
+
+
+SCNet (CVPR'2020)
+
+```bibtex
+@inproceedings{liu2020improving,
+ title={Improving Convolutional Networks with Self-Calibrated Convolutions},
+ author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={10096--10105},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py) | 256x192 | 0.728 | 0.899 | 0.807 | 0.784 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192-6920f829_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192_20200709.log.json) |
+| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py) | 384x288 | 0.751 | 0.906 | 0.818 | 0.802 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288-9cacd0ea_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288_20200709.log.json) |
+| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py) | 256x192 | 0.733 | 0.902 | 0.811 | 0.789 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192-6d348ef9_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192_20200709.log.json) |
+| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py) | 384x288 | 0.752 | 0.906 | 0.823 | 0.804 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288-0b6e631b_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288_20200709.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml
new file mode 100644
index 0000000..33d1f99
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/scnet_coco.yml
@@ -0,0 +1,66 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SCNet
+ Training Data: COCO
+ Name: td-hm_scnet50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.728
+ AP@0.5: 0.899
+ AP@0.75: 0.807
+ AR: 0.784
+ AR@0.5: 0.938
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_256x192-6920f829_20200709.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_scnet50_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.751
+ AP@0.5: 0.906
+ AP@0.75: 0.818
+ AR: 0.802
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_coco_384x288-9cacd0ea_20200709.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_scnet101_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.733
+ AP@0.5: 0.902
+ AP@0.75: 0.811
+ AR: 0.789
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_256x192-6d348ef9_20200709.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_scnet101_8xb48-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.752
+ AP@0.5: 0.906
+ AP@0.75: 0.823
+ AR: 0.804
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_coco_384x288-0b6e631b_20200709.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md
new file mode 100644
index 0000000..f08f1f3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.md
@@ -0,0 +1,47 @@
+
+
+
+SEResNet (CVPR'2018)
+
+```bibtex
+@inproceedings{hu2018squeeze,
+ title={Squeeze-and-excitation networks},
+ author={Hu, Jie and Shen, Li and Sun, Gang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={7132--7141},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a person detector that has a human AP of 56.4 on COCO val2017.
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py) | 256x192 | 0.729 | 0.903 | 0.807 | 0.784 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192-25058b66_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192_20200727.log.json) |
+| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py) | 384x288 | 0.748 | 0.904 | 0.819 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288-bc0b7680_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288_20200727.log.json) |
+| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py) | 256x192 | 0.734 | 0.905 | 0.814 | 0.790 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192-83f29c4d_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192_20200727.log.json) |
+| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py) | 384x288 | 0.754 | 0.907 | 0.823 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288-48de1709_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288_20200727.log.json) |
+| [pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py) | 256x192 | 0.730 | 0.899 | 0.810 | 0.787 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192-1c628d79_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192_20200727.log.json) |
+| [pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py) | 384x288 | 0.753 | 0.906 | 0.824 | 0.806 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288-58b23ee8_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288_20200727.log.json) |
+
+Note that \* means the model was trained without ImageNet pre-training.
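+
+As a quick sanity check, any checkpoint in the table can be loaded through MMPose's high-level APIs. The snippet below is a minimal sketch using the SEResNet-50 256x192 pair from the table; `person.jpg` is a placeholder image path to substitute:
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# Any config/checkpoint pair from the table above works here.
+config = 'configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py'
+checkpoint = 'https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192-25058b66_20200727.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+# With no bounding boxes given, the whole image is treated as one person instance.
+results = inference_topdown(model, 'person.jpg')
+print(results[0].pred_instances.keypoints.shape)  # expected: (1, 17, 2)
+```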
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml
new file mode 100644
index 0000000..3a4f04a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/seresnet_coco.yml
@@ -0,0 +1,98 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SEResNet
+ Training Data: COCO
+ Name: td-hm_seresnet50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.729
+ AP@0.5: 0.903
+ AP@0.75: 0.807
+ AR: 0.784
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_256x192-25058b66_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_seresnet50_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.748
+ AP@0.5: 0.904
+ AP@0.75: 0.819
+ AR: 0.799
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_coco_384x288-bc0b7680_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_seresnet101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.734
+ AP@0.5: 0.905
+ AP@0.75: 0.814
+ AR: 0.79
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_256x192-83f29c4d_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_seresnet101_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.907
+ AP@0.75: 0.823
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_coco_384x288-48de1709_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_seresnet152_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.73
+ AP@0.5: 0.899
+ AP@0.75: 0.81
+ AR: 0.787
+ AR@0.5: 0.939
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_256x192-1c628d79_20200727.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_seresnet152_8xb48-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.753
+ AP@0.5: 0.906
+ AP@0.75: 0.824
+ AR: 0.806
+ AR@0.5: 0.945
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_coco_384x288-58b23ee8_20200727.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md
new file mode 100644
index 0000000..d331889
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md
@@ -0,0 +1,41 @@
+
+
+
+ShufflenetV1 (CVPR'2018)
+
+```bibtex
+@inproceedings{zhang2018shufflenet,
+ title={Shufflenet: An extremely efficient convolutional neural network for mobile devices},
+ author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={6848--6856},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a person detector that has a human AP of 56.4 on COCO val2017.
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py) | 256x192 | 0.587 | 0.849 | 0.654 | 0.654 | 0.896 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192-7a7ea4f4_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192_20221013.log) |
+| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py) | 384x288 | 0.626 | 0.862 | 0.696 | 0.687 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288-8342f8ba_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288_20221013.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml
new file mode 100644
index 0000000..c20a130
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.yml
@@ -0,0 +1,35 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ShufflenetV1
+ Training Data: COCO
+ Name: td-hm_shufflenetv1_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.587
+ AP@0.5: 0.849
+ AP@0.75: 0.654
+ AR: 0.654
+ AR@0.5: 0.896
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192-7a7ea4f4_20221013.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_shufflenetv1_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.626
+ AP@0.5: 0.862
+ AP@0.75: 0.696
+ AR: 0.687
+ AR@0.5: 0.903
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288-8342f8ba_20221013.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md
new file mode 100644
index 0000000..3c80e76
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md
@@ -0,0 +1,41 @@
+
+
+
+ShufflenetV2 (ECCV'2018)
+
+```bibtex
+@inproceedings{ma2018shufflenet,
+ title={Shufflenet v2: Practical guidelines for efficient cnn architecture design},
+ author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={116--131},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a person detector that has a human AP of 56.4 on COCO val2017.
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.602 | 0.857 | 0.672 | 0.668 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192-51fb931e_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192_20221014.log) |
+| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.638 | 0.866 | 0.707 | 0.699 | 0.910 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288-d30ab55c_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288_20221014.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml
new file mode 100644
index 0000000..3c87873
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.yml
@@ -0,0 +1,35 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ShufflenetV2
+ Training Data: COCO
+ Name: td-hm_shufflenetv2_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.602
+ AP@0.5: 0.857
+ AP@0.75: 0.672
+ AR: 0.668
+ AR@0.5: 0.902
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192-51fb931e_20221014.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_shufflenetv2_8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.638
+ AP@0.5: 0.866
+ AP@0.75: 0.707
+ AR: 0.699
+ AR@0.5: 0.91
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288-d30ab55c_20221014.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md
new file mode 100644
index 0000000..0d142ce
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.md
@@ -0,0 +1,78 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+Swin (ICCV'2021)
+
+```bibtex
+@inproceedings{liu2021swin,
+ title={Swin transformer: Hierarchical vision transformer using shifted windows},
+ author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={10012--10022},
+ year={2021}
+}
+```
+
+
+
+
+
+
+FPN (CVPR'2017)
+
+```bibtex
+@inproceedings{lin2017feature,
+ title={Feature pyramid networks for object detection},
+ author={Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2117--2125},
+ year={2017}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a person detector that has a human AP of 56.4 on COCO val2017.
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_swin_t](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.724 | 0.901 | 0.806 | 0.782 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192-eaefe010_20220503.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192_20220503.log.json) |
+| [pose_swin_b](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.904 | 0.820 | 0.794 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192-7432be9e_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192_20220705.log.json) |
+| [pose_swin_b](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py) | 384x288 | 0.759 | 0.910 | 0.832 | 0.811 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288-3abf54f9_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288_20220705.log.json) |
+| [pose_swin_l](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py) | 256x192 | 0.743 | 0.906 | 0.821 | 0.798 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192-642a89db_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192_20220705.log.json) |
+| [pose_swin_l](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py) | 384x288 | 0.763 | 0.912 | 0.830 | 0.814 | 0.949 | [ckpt](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288-c36b7845_20220705.pth) | [log](https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288_20220705.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml
new file mode 100644
index 0000000..569993e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/swin_coco.yml
@@ -0,0 +1,99 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - Swin
+ Training Data: COCO
+ Name: td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.724
+ AP@0.5: 0.901
+ AP@0.75: 0.806
+ AR: 0.782
+ AR@0.5: 0.94
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_t_p4_w7_coco_256x192-eaefe010_20220503.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.904
+ AP@0.75: 0.82
+ AR: 0.794
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_256x192-7432be9e_20220705.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.759
+ AP@0.5: 0.91
+ AP@0.75: 0.832
+ AR: 0.811
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_coco_384x288-3abf54f9_20220705.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.743
+ AP@0.5: 0.906
+ AP@0.75: 0.821
+ AR: 0.798
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_256x192-642a89db_20220705.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.763
+ AP@0.5: 0.912
+ AP@0.75: 0.83
+ AR: 0.814
+ AR@0.5: 0.949
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_l_p4_w7_coco_384x288-c36b7845_20220705.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/swin_b_p4_w7_fpn_coco_256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: topdown_heatmap_swin_b_p4_w7_fpn_coco_256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.741
+ AP@0.5: 0.907
+ AP@0.75: 0.821
+ AR: 0.798
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/swin/swin_b_p4_w7_fpn_coco_256x192-a3b91c45_20220705.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py
new file mode 100644
index 0000000..dbed81b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py
@@ -0,0 +1,167 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
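+# e.g. 8 GPUs x batch 64 = 512 matches the base, so the LR is used as-is;
+# other totals are rescaled linearly when auto scaling is enabled.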
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
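+# Heatmaps are predicted at 1/4 of the input resolution: (192, 256) input -> (48, 64) heatmap.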
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
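+    # VisPredictHead wraps the pose head configured below and adds a
+    # per-keypoint visibility branch trained with BCE.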
+ head=dict(
+ type='VisPredictHead',
+ loss=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ use_sigmoid=True,
+ loss_weight=1e-3,
+ ),
+ pose_cfg=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec)),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root='data/aic/',
+ data_mode=data_mode,
+ ann_file='annotations/aic_train.json',
+ data_prefix=dict(img='ai_challenger_keypoint_train_20170902/'
+ 'keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
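+                # each pair is (source AIC keypoint index, target COCO keypoint index)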
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ # score_mode='bbox',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
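+
+# Usage sketch (assumption: MMPose's standard entry point, run from the repo root):
+#   python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm-vis_res50_8xb64-210e_coco-aic-256x192-merge.py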
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..131a4fe
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,152 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# Multiple Gaussian kernel sizes for the heatmaps, as used in the 'Megvii' approach.
+kernel_sizes = [15, 11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
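+# One codec per supervision level: earlier (coarser) levels use larger
+# Gaussian kernels, and the decoder uses the finest level.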
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MSPN',
+ unit_channels=256,
+ num_stages=2,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ norm_cfg=dict(type='BN'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=2,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sublist corresponds to a stage; each element in it corresponds to a unit
+ level_indices=[0, 1, 2, 3] + [1, 2, 3, 4],
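+        # per stage: the first 3 units use plain MSE (weight 0.25) and the
+        # final unit uses OHKM-based MSE (weight 1.0)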
+ loss=([
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ]) * 2,
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..0eb4a71
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# Multiple Gaussian kernel sizes for the heatmaps, as used in the 'Megvii' approach.
+kernel_sizes = [15, 11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='RSN',
+ unit_channels=256,
+ num_stages=2,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ num_steps=4,
+ norm_cfg=dict(type='BN'),
+ ),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=2,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sublist corresponds to a stage; each element in it corresponds to a unit
+ level_indices=[0, 1, 2, 3] + [1, 2, 3, 4],
+ loss=([
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ]) * 2,
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..0d3020d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,152 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# Multiple Gaussian kernel sizes for the heatmaps, as used in the 'Megvii' approach.
+kernel_sizes = [15, 11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MSPN',
+ unit_channels=256,
+ num_stages=3,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ norm_cfg=dict(type='BN'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=3,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sublist corresponds to a stage; each element in it corresponds to a unit
+ level_indices=[0, 1, 2, 3] * 2 + [1, 2, 3, 4],
+ loss=([
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ]) * 3,
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..afc35be
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# Multiple Gaussian kernel sizes for the heatmaps, as used in the 'Megvii' approach.
+kernel_sizes = [15, 11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='RSN',
+ unit_channels=256,
+ num_stages=3,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ num_steps=4,
+ norm_cfg=dict(type='BN'),
+ ),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=3,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sublist corresponds to a stage; each element in it corresponds to a unit
+ level_indices=[0, 1, 2, 3] * 2 + [1, 2, 3, 4],
+ loss=([
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ]) * 3,
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..a3870f4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,152 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# Multiple Gaussian kernel sizes for the heatmaps, as used in the 'Megvii' approach.
+kernel_sizes = [15, 11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MSPN',
+ unit_channels=256,
+ num_stages=4,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ norm_cfg=dict(type='BN'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=4,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sublist corresponds to a stage; each element in it corresponds to a unit
+ level_indices=[0, 1, 2, 3] * 3 + [1, 2, 3, 4],
+ loss=([
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ]) * 4,
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..9cedcfe
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
@@ -0,0 +1,153 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.75,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='base',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.3,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'),
+ ),
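+    # ViT/16 features are at 1/16 of the input; scale_factor=4.0 brings them
+    # to the 1/4 resolution the simple head expects.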
+ neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ deconv_out_channels=[],
+ deconv_kernel_sizes=[],
+ final_layer=dict(kernel_size=3, padding=1),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..f672aff
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.75,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='base',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.3,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_base_20230913.pth'),
+ ),
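+    # classic decoder: two stride-2 4x4 deconv layers upsample the 1/16 ViT
+    # features to the 1/4 heatmap resolution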
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..6c8316b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
@@ -0,0 +1,153 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=32,
+ layer_decay_rate=0.85,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
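+# layer-wise lr decay: under the usual layer-decay rule, ViT block i of the
+# N=32 layers trains with lr * 0.85 ** (N - i), so the earliest blocks (closest
+# to the pretrained MAE features) receive the smallest updates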
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='huge',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.55,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'),
+ ),
+ neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True),
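+    # 'simple' decoder variant: the neck upsamples the ViT feature map 4x (with
+    # ReLU) and the head below predicts heatmaps with a single 3x3 conv instead
+    # of the two deconv layers used by the classic decoder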
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=17,
+ deconv_out_channels=[],
+ deconv_kernel_sizes=[],
+ final_layer=dict(kernel_size=3, padding=1),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..483231d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=32,
+ layer_decay_rate=0.85,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='huge',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.55,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_huge_20230913.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..f5576bc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
@@ -0,0 +1,153 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=24,
+ layer_decay_rate=0.8,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='large',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.5,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'),
+ ),
+ neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ deconv_out_channels=[],
+ deconv_kernel_sizes=[],
+ final_layer=dict(kernel_size=3, padding=1),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..d0f6adb
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=24,
+ layer_decay_rate=0.8,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='large',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.5,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_large_20230913.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..4eedd6d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.8,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch={
+ 'embed_dims': 384,
+ 'num_layers': 12,
+ 'num_heads': 12,
+ 'feedforward_channels': 384 * 4
+ },
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.1,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'),
+ ),
+ neck=dict(type='FeatureMapProcessor', scale_factor=4.0, apply_relu=True),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=384,
+ out_channels=17,
+ deconv_out_channels=[],
+ deconv_kernel_sizes=[],
+ final_layer=dict(kernel_size=3, padding=1),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..4918eea
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.8,
+ custom_keys={
+            'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch={
+ 'embed_dims': 384,
+ 'num_layers': 12,
+ 'num_heads': 12,
+ 'feedforward_channels': 384 * 4
+ },
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.1,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=384,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..dcd903f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(40, 56), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='AlexNet', num_classes=-1),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=256,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..5d71939
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(36, 48), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=17,
+ feat_channels=128,
+ num_stages=6),
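+    # CPM already emits a 17-channel belief map from each of its 6 stages, so
+    # CPMHead consumes them directly (in_channels=17, final_layer=None) and
+    # supervises every stage's output (intermediate supervision); CPM's output
+    # stride is 8, hence the 36x48 heatmaps for the 288x384 input above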
+ head=dict(
+ type='CPMHead',
+ in_channels=17,
+ out_channels=17,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..662a0fe
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(24, 32), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=17,
+ feat_channels=128,
+ num_stages=6),
+ head=dict(
+ type='CPMHead',
+ in_channels=17,
+ out_channels=17,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py
new file mode 100644
index 0000000..b83f3ce
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=17,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py
new file mode 100644
index 0000000..86e35f8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(384, 384), heatmap_size=(96, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=17,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..6537ad0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py
@@ -0,0 +1,174 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
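+# the learned relative position bias tables are excluded from weight decay, the
+# usual treatment for relative position embeddings in attention layers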
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRFormer',
+ in_channels=3,
+ norm_cfg=norm_cfg,
+ extra=dict(
+ drop_path_rate=0.2,
+ with_rpe=True,
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(2, ),
+ num_channels=(64, ),
+ num_heads=[2],
+ mlp_ratios=[4]),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2),
+ num_channels=(78, 156),
+ num_heads=[2, 4],
+ mlp_ratios=[4, 4],
+ window_sizes=[7, 7]),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2),
+ num_channels=(78, 156, 312),
+ num_heads=[2, 4, 8],
+ mlp_ratios=[4, 4, 4],
+ window_sizes=[7, 7, 7]),
+ stage4=dict(
+ num_modules=2,
+ num_branches=4,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2, 2),
+ num_channels=(78, 156, 312, 624),
+ num_heads=[2, 4, 8, 16],
+ mlp_ratios=[4, 4, 4, 4],
+ window_sizes=[7, 7, 7, 7])),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrformer_base-32815020_20220226.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=78,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
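+# dynamic loss scaling grows/shrinks the scale factor to avoid fp16 gradient
+# under/overflow; in MMEngine-based MMPose, mixed precision is typically enabled
+# with the --amp launch flag, which swaps in AmpOptimWrapper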
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..b055be5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py
@@ -0,0 +1,174 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRFormer',
+ in_channels=3,
+ norm_cfg=norm_cfg,
+ extra=dict(
+ drop_path_rate=0.2,
+ with_rpe=True,
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(2, ),
+ num_channels=(64, ),
+ num_heads=[2],
+ mlp_ratios=[4]),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2),
+ num_channels=(78, 156),
+ num_heads=[2, 4],
+ mlp_ratios=[4, 4],
+ window_sizes=[7, 7]),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2),
+ num_channels=(78, 156, 312),
+ num_heads=[2, 4, 8],
+ mlp_ratios=[4, 4, 4],
+ window_sizes=[7, 7, 7]),
+ stage4=dict(
+ num_modules=2,
+ num_branches=4,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2, 2),
+ num_channels=(78, 156, 312, 624),
+ num_heads=[2, 4, 8, 16],
+ mlp_ratios=[4, 4, 4, 4],
+ window_sizes=[7, 7, 7, 7])),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrformer_base-32815020_20220226.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=78,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..e283ae3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py
@@ -0,0 +1,174 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRFormer',
+ in_channels=3,
+ norm_cfg=norm_cfg,
+ extra=dict(
+ drop_path_rate=0.1,
+ with_rpe=True,
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(2, ),
+ num_channels=(64, ),
+ num_heads=[2],
+                mlp_ratios=[4]),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2),
+ num_channels=(32, 64),
+ num_heads=[1, 2],
+ mlp_ratios=[4, 4],
+ window_sizes=[7, 7]),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2),
+ num_channels=(32, 64, 128),
+ num_heads=[1, 2, 4],
+ mlp_ratios=[4, 4, 4],
+ window_sizes=[7, 7, 7]),
+ stage4=dict(
+ num_modules=2,
+ num_branches=4,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2, 2),
+ num_channels=(32, 64, 128, 256),
+ num_heads=[1, 2, 4, 8],
+ mlp_ratios=[4, 4, 4, 4],
+ window_sizes=[7, 7, 7, 7])),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrformer_small-09516375_20220226.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
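+# Note: mmengine-style runners configure mixed precision through
+# optim_wrapper (see the *_fp16-* config using AmpOptimWrapper); this
+# standalone fp16 dict is a legacy 0.x-style field and is likely ignored.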
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..323a168
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py
@@ -0,0 +1,174 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
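+# decay_mult=0 exempts HRFormer's relative position bias tables from
+# weight decay, the usual treatment for transformer positional parameters.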
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
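+# Same 4x ratio at the larger input (288/4 = 72, 384/4 = 96); sigma is
+# raised to 3 so the Gaussian target stays roughly proportional to the
+# heatmap resolution.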
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRFormer',
+ in_channels=3,
+ norm_cfg=norm_cfg,
+ extra=dict(
+ drop_path_rate=0.1,
+ with_rpe=True,
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(2, ),
+ num_channels=(64, ),
+ num_heads=[2],
+ num_mlp_ratios=[4]),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2),
+ num_channels=(32, 64),
+ num_heads=[1, 2],
+ mlp_ratios=[4, 4],
+ window_sizes=[7, 7]),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2),
+ num_channels=(32, 64, 128),
+ num_heads=[1, 2, 4],
+ mlp_ratios=[4, 4, 4],
+ window_sizes=[7, 7, 7]),
+ stage4=dict(
+ num_modules=2,
+ num_branches=4,
+ block='HRFORMERBLOCK',
+ num_blocks=(2, 2, 2, 2),
+ num_channels=(32, 64, 128, 256),
+ num_heads=[1, 2, 4, 8],
+ mlp_ratios=[4, 4, 4, 4],
+ window_sizes=[7, 7, 7, 7])),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrformer_small-09516375_20220226.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..7d89ef5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..fdb2bae
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py
new file mode 100644
index 0000000..8499f40
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py
@@ -0,0 +1,221 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=3))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# keypoint mappings
+keypoint_mapping_coco = [
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+keypoint_mapping_aic = [
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ (12, 17),
+ (13, 18),
+]
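+# Each tuple is (AIC keypoint index, slot in the combined 19-keypoint
+# skeleton): AIC's 12 body joints map onto the matching COCO slots, while
+# head-top (12) and neck (13) occupy the two extra slots 17 and 18.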
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
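+    # the model meta is overridden with the merged COCO+AIC definition so
+    # that flip pairs and skeleton links cover all 19 keypoints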
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=19,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ output_keypoint_indices=[
+ target for _, target in keypoint_mapping_coco
+ ]))
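+# output_keypoint_indices keeps only the 17 channels mapped from COCO at
+# inference, so the 19-keypoint head still yields standard COCO-ordered
+# predictions for CocoMetric.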
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_coco)
+ ],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root='data/aic/',
+ data_mode=data_mode,
+ ann_file='annotations/aic_train.json',
+ data_prefix=dict(img='ai_challenger_keypoint_train_20170902/'
+ 'keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=19,
+ mapping=keypoint_mapping_aic)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py
new file mode 100644
index 0000000..5ac097c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py
@@ -0,0 +1,187 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root='data/aic/',
+ data_mode=data_mode,
+ ann_file='annotations/aic_train.json',
+ data_prefix=dict(img='ai_challenger_keypoint_train_20170902/'
+ 'keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=17,
+ mapping=[
+ (0, 6),
+ (1, 8),
+ (2, 10),
+ (3, 5),
+ (4, 7),
+ (5, 9),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 11),
+ (10, 13),
+ (11, 15),
+ ])
+ ],
+)
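+# Unlike the -combine config, this -merge variant converts AIC's 12 shared
+# body joints into COCO's own 17-keypoint order and drops head-top/neck,
+# so both datasets supervise a plain 17-channel head.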
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
+ datasets=[dataset_coco, dataset_aic],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..22a9627
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py
@@ -0,0 +1,165 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'body_2d_keypoint/topdown_heatmap/coco/'
+ 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(
+ type='CoarseDropout',
+ max_holes=8,
+ max_height=40,
+ max_width=40,
+ min_holes=1,
+ min_height=10,
+ min_width=10,
+ p=0.5),
+ ]),
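+    # CoarseDropout (albumentations) blanks 1-8 rectangular patches of
+    # 10-40 px with p=0.5; the occlusion is image-only, keypoint targets
+    # are untouched, so the model learns to infer hidden joints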
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..a11db9e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
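+# unbiased=True selects the DarkPose variant (Zhang et al., CVPR 2020):
+# unbiased Gaussian target encoding plus Taylor-expansion decoding, which
+# cuts the quantisation error of plain argmax heatmap decoding.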
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..b2bb6e4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..50188c5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_fp16-8xb64-210e_coco-256x192.py
@@ -0,0 +1,7 @@
+_base_ = ['./td-hm_hrnet-w32_8xb64-210e_coco-256x192.py']
+
+# fp16 settings
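+# AmpOptimWrapper runs the base config's optimizer under torch.amp mixed
+# precision; loss_scale='dynamic' lets the gradient scaler adapt itself
+# rather than using a fixed scale.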
+optim_wrapper = dict(
+ type='AmpOptimWrapper',
+ loss_scale='dynamic',
+)
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..5157242
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'body_2d_keypoint/topdown_heatmap/coco/'
+ 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(
+ type='GridDropout',
+ unit_size_min=10,
+ unit_size_max=40,
+ random_offset=True,
+ p=0.5),
+ ]),
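+    # GridDropout (albumentations) masks a regular grid of square cells
+    # (10-40 px, random offset) with p=0.5 -- a structured alternative to
+    # the CoarseDropout used in the sibling config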
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..ea82efc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py
@@ -0,0 +1,153 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'body_2d_keypoint/topdown_heatmap/coco/'
+ 'td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
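+    # PhotometricDistortion randomly jitters brightness, contrast,
+    # saturation and hue; image-only, targets untouched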
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..54ae3d9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
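+# UDPHeatmap implements Unbiased Data Processing (Huang et al., CVPR 2020),
+# defining coordinates in a continuous unit-length space; it pairs with
+# use_udp=True in the TopdownAffine steps below, and shift_heatmap=False
+# since UDP already removes the flip alignment bias.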
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..3344529
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..f29c8a2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ heatmap_type='combined')
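+# heatmap_type='combined' emits one classification map plus x/y offset
+# maps per keypoint (hence out_channels=3*17 in the head) and is trained
+# with CombinedTargetMSELoss; decoding regresses offsets instead of argmax.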
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=3 * 17,
+ deconv_out_channels=None,
+ loss=dict(type='CombinedTargetMSELoss', use_target_weight=True),
+ decoder=codec),
+ train_cfg=dict(compute_acc=False),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='udp_combined',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..5ddd421
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
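+# with auto LR scaling enabled (e.g. --auto-scale-lr at launch), MMEngine
+# rescales the learning rate linearly when the effective batch size differs
+# from base_batch_size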
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
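+# MSRAHeatmap is the classic Gaussian-heatmap codec used by SimpleBaseline
+# and HRNet; input_size is (width, height) and heatmap_size is 1/4 of it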
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
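+# flip_test averages heatmaps from the original and horizontally flipped
+# inputs; shift_heatmap=True applies the one-pixel shift of the flipped
+# heatmap inherited from the SimpleBaseline decoding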
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..755f236
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
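+# at 384x288 input the heatmap grows to 96x72, so sigma is raised from 2 to 3
+# to keep the Gaussian target proportionate to the higher resolution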
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..80ffe8a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
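+# unbiased=True enables DarkPose (Zhang et al., CVPR 2020):
+# distribution-aware target encoding plus Taylor-expansion decoding, which
+# removes the quantisation bias of ordinary heatmap targets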
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..04cd41c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..6cd31a7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
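+# UDP (Huang et al., CVPR 2020) removes coordinate-system bias; note
+# use_udp=True in the TopdownAffine steps below and shift_heatmap=False,
+# since the UDP decoder needs no flip-shift correction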
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..f9edf38
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..f801f7a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py
@@ -0,0 +1,140 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(2, 4, 2),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
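+# LiteHRNet keeps the multi-resolution HRNet layout but swaps the residual
+# units for lightweight shuffle-style blocks (module_type='LITE'); with no
+# init_cfg set, the backbone here trains from scratch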
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..dd59f59
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py
@@ -0,0 +1,140 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(2, 4, 2),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..8b69bbc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py
@@ -0,0 +1,140 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(3, 8, 3),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..2aa7f3c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py
@@ -0,0 +1,140 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(3, 8, 3),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..1601819
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='mmcls://mobilenet_v2',
+ )),
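+    # 'mmcls://' resolves to an MMClassification ImageNet checkpoint;
+    # out_indices=(7, ) taps MobileNetV2's final 1280-channel feature map,
+    # matching the head's in_channels below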
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..fcf4ee7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='mmcls://mobilenet_v2',
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..b1dbb18
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,152 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# multiple Gaussian kernel sizes for the Megvii-style heatmap targets
+kernel_sizes = [11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
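+# MSPN supervises every unit of its multi-stage refinement, so the codec is a
+# list: earlier (coarser) units get wider kernels, per the Megvii recipe.
+# Only the finest codec decodes predictions (decoder=codec[-1] below), and the
+# final unit is trained with OHKM (online hard keypoint mining) at full weight.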
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MSPN',
+ unit_channels=256,
+ num_stages=1,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ norm_cfg=dict(type='BN'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=1,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # one sub-list per stage; each element in a sub-list is for a unit
+ level_indices=[0, 1, 2, 3],
+ loss=[
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ],
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..4a8704a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py
@@ -0,0 +1,127 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='PyramidVisionTransformer',
+ num_layers=[3, 4, 6, 3],
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/whai362/PVT/'
+ 'releases/download/v2/pvt_small.pth'),
+ ),
+ neck=dict(type='FeatureMapProcessor', select_index=3),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=512,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
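+# PVT returns a four-stage feature pyramid; the FeatureMapProcessor neck
+# selects stage 4 (select_index=3, 512 channels) as the single input to the
+# heatmap head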
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..dd7b5d9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py
@@ -0,0 +1,128 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='PyramidVisionTransformerV2',
+ embed_dims=64,
+ num_layers=[3, 4, 6, 3],
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/whai362/PVT/'
+ 'releases/download/v2/pvt_v2_b2.pth'),
+ ),
+ neck=dict(type='FeatureMapProcessor', select_index=3),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=512,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
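+        # standard top-down protocol: evaluate on pre-computed person detections (a ~56 AP detector) instead of GT boxes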
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
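+# OKS-based COCO keypoint AP on val2017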
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..25ebf01
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
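+        # 'torchvision://' loads torchvision's ImageNet-pretrained ResNet-101 weights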
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..29f6555
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..3ef9880
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
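+# 'dark' variant: DarkPose's unbiased Gaussian encoding and distribution-aware (Taylor) decoding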
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..c8ce6c1
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..b2307b2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..eae41ac
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..4d55253
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..524d999
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py
@@ -0,0 +1,126 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
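+# the larger 384x288 heatmaps get a wider Gaussian modulation kernel (17) for DarkPose decoding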
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True,
+ blur_kernel_size=17)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..e00887f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..91e8ef2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..07b6031
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..7678941
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..57c8374
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py
@@ -0,0 +1,7 @@
+_base_ = ['./td-hm_res50_8xb64-210e_coco-256x192.py']
+
+# fp16 settings
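+# AmpOptimWrapper runs training in automatic mixed precision; 'dynamic' loss scaling guards against fp16 underflow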
+optim_wrapper = dict(
+ type='AmpOptimWrapper',
+ loss_scale='dynamic',
+)
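+
+# Usage sketch (assumes mmpose's standard tools/train.py entry point and that the
+# config path below is reachable from the working directory):
+#   python tools/train.py \
+#       modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py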
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..05be08c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=101,
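+        # 'mmcls://' pulls ImageNet-pretrained ResNeSt weights from the MMClassification model zoo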
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..fb08555
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py
new file mode 100644
index 0000000..48e6992
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=128)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=200,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest200'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..85466b7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=200,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest200'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py
new file mode 100644
index 0000000..5279a43
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=128)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=269,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest269'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..84eb68b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=269,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest269'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..a9fb575
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..b6e3dcb
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeSt',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnest50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..5d90ab7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
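+        # ResNetV1d: deep-stem ResNet variant (stacked 3x3 stem convs,
+        # avg-pool downsampling in the shortcut branches)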
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..f2a66df
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..4821267
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py
new file mode 100644
index 0000000..27281df
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=384)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=48,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..e418369
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..59d3ba6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..eedb64c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=101,
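+        # 'mmcls://' pulls ImageNet-pretrained weights from the
+        # MMClassification model zoo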
+ init_cfg=dict(
+ type='Pretrained', checkpoint='mmcls://resnext101_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..42487c0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=101,
+ init_cfg=dict(
+ type='Pretrained', checkpoint='mmcls://resnext101_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..82cfeae
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=152,
+ init_cfg=dict(
+ type='Pretrained', checkpoint='mmcls://resnext152_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py
new file mode 100644
index 0000000..2503796
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=384)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=152,
+ init_cfg=dict(
+ type='Pretrained', checkpoint='mmcls://resnext152_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=48,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..2513248
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnext50_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..756010c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnext50_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..7641846
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-2,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 190, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# multiple kernel sizes for the Gaussian heatmaps of the 'Megvii'
+# (MSPN/RSN) approach
+kernel_sizes = [11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
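+# one codec per RSN unit: large kernels give coarse targets for early
+# units, small kernels sharp targets for late ones (coarse-to-fine)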
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='RSN',
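+        # RSN: Residual Steps Network backbone (here one stage, four units)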
+ unit_channels=256,
+ num_stages=1,
+ num_units=4,
+ num_blocks=[2, 2, 2, 2],
+ num_steps=4,
+ norm_cfg=dict(type='BN'),
+ ),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
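+        # (h, w) of the output heatmaps, i.e. the codec heatmap_size reversed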
+ unit_channels=256,
+ out_channels=17,
+ num_stages=1,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+ # each sub list is for a stage
+ # and each element in each list is for a unit
+ level_indices=[0, 1, 2, 3],
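+        # intermediate units get plain MSE at 0.25 weight; the final unit
+        # uses OHKM (online hard keypoint mining) at full weight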
+ loss=[
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ],
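+        # predictions are decoded with the last (kernel_size=5) codec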
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
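+    # multilevel=True encodes one heatmap target per codec above, so each
+    # sample carries four supervision levels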
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
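The two RSN configs are the only ones in this batch with a *list* of codecs: one `MegviiHeatmap` per kernel size, consumed by `GenerateTarget(multilevel=True)`, while decoding uses only the finest level (`decoder=codec[-1]`, kernel size 5). The head's loss list is likewise built by list arithmetic; an illustration of what it evaluates to:

```python
# Illustration only: the MSPNHead loss list above expands to four entries,
# plain MSE at weight 0.25 for the first three units and OHKM (online hard
# keypoint mining) MSE at weight 1.0 for the last.
mse = dict(type='KeypointMSELoss', use_target_weight=True, loss_weight=0.25)
ohkm = dict(type='KeypointOHKMMSELoss', use_target_weight=True, loss_weight=1.)
losses = [mse] * 3 + [ohkm]
assert len(losses) == 4  # one loss per unit (num_units=4)
```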
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..b144cf6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+# multiple Gaussian kernel sizes (one per level) for the 'Megvii' heatmap targets.
+kernel_sizes = [11, 9, 7, 5]
+codec = [
+ dict(
+ type='MegviiHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ kernel_size=kernel_size) for kernel_size in kernel_sizes
+]
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='RSN',
+ unit_channels=256,
+ num_stages=1,
+ num_units=4,
+ num_blocks=[3, 4, 6, 3],
+ num_steps=4,
+ norm_cfg=dict(type='BN'),
+ ),
+ head=dict(
+ type='MSPNHead',
+ out_shape=(64, 48),
+ unit_channels=256,
+ out_channels=17,
+ num_stages=1,
+ num_units=4,
+ norm_cfg=dict(type='BN'),
+        # each sub-list corresponds to a stage,
+        # and each element within it to a unit
+ level_indices=[0, 1, 2, 3],
+ loss=[
+ dict(
+ type='KeypointMSELoss',
+ use_target_weight=True,
+ loss_weight=0.25)
+ ] * 3 + [
+ dict(
+ type='KeypointOHKMMSELoss',
+ use_target_weight=True,
+ loss_weight=1.)
+ ],
+ decoder=codec[-1]),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='GenerateTarget', multilevel=True, encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec[0]['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json',
+ nms_mode='none')
+test_evaluator = val_evaluator
+
+# fp16 settings
+fp16 = dict(loss_scale='dynamic')
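Both RSN files also enable mixed-precision training with a dynamic loss scale. Conceptually, the scale multiplies the loss before backprop so small fp16 gradients don't flush to zero, gradients are unscaled before the update, and the scale shrinks whenever an overflow is detected. The sketch below is illustrative pseudocode of that loop, not the mmengine API; `grads_fn` is a hypothetical stand-in for autograd:

```python
import math

def fp16_step(loss, params, grads_fn, lr, scale):
    """One update with dynamic loss scaling (conceptual sketch only)."""
    grads = grads_fn(loss * scale)                 # backprop through the scaled loss
    if any(not math.isfinite(g) for g in grads):   # fp16 overflow detected
        return scale / 2                           # skip the update, shrink the scale
    for i, g in enumerate(grads):
        params[i] -= lr * g / scale                # unscale before applying
    return scale                                   # real scalers also grow it back over time
```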
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..d8e49fe
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=101,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet101-94250a77.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=1,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
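`auto_scale_lr` records the batch size the base LR was tuned for; when LR auto-scaling is switched on at launch (mmpose's `tools/train.py` exposes an `--auto-scale-lr` flag for this), the optimizer LR is rescaled linearly by the ratio of the actual total batch to `base_batch_size`:

```python
def scaled_lr(base_lr, base_batch_size, num_gpus, samples_per_gpu):
    """Linear scaling rule applied by auto_scale_lr."""
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

# This file as named (8 GPUs x batch 32 = 256) against base_batch_size=512:
print(scaled_lr(5e-4, 512, 8, 32))  # 0.00025, half the configured LR
```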
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py
new file mode 100644
index 0000000..3281e4a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=101,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet101-94250a77.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=48,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
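A geometric invariant shared by the single-codec files: the heatmap is a 4x downsampling of the network input, with the Gaussian `sigma` set to 2 for 256x192 inputs and 3 for the 384x288 variants (the Swin-B 384x288 file further below is the one exception, keeping `sigma=2`). A quick consistency check over the two geometries:

```python
# input_size // 4 == heatmap_size for every MSRAHeatmap codec in this diff
for input_size, heatmap_size in [((192, 256), (48, 64)), ((288, 384), (72, 96))]:
    assert tuple(s // 4 for s in input_size) == heatmap_size
```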
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..41071b6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet50-7ef0a199.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=1,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..7355333
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet50-7ef0a199.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..f1bb265
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
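The schedule is likewise shared across these files: a 500-iteration linear warm-up from 0.1% of the base LR (`by_epoch=False`), then 10x decay at the milestone epochs, usually 170 and 200 (the RSN-18 file adds 190). A small helper showing the resulting LR under those settings:

```python
def lr_at(base_lr, epoch, iteration, milestones=(170, 200)):
    """LR produced by the LinearLR warm-up followed by MultiStepLR decay."""
    if epoch == 0 and iteration < 500:  # warm-up counts iterations, not epochs
        return base_lr * (0.001 + (1 - 0.001) * iteration / 500)
    return base_lr * 0.1 ** sum(epoch >= m for m in milestones)

print(lr_at(5e-4, 0, 0))    # 5e-07  (start of warm-up)
print(lr_at(5e-4, 100, 0))  # 0.0005 (plateau at the base LR)
print(lr_at(5e-4, 205, 0))  # ~5e-06 (after both milestones)
```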
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..d679fc9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..721d4b8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=152,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
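One quirk worth flagging: unlike the SE-ResNet-50/101 files, neither SE-ResNet-152 config sets a backbone `init_cfg`, so the backbone starts from random initialization unless weights are supplied externally. A hedged way to provide a full-model checkpoint through MMEngine's standard `load_from` key (the path is a placeholder):

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/'
    'td-hm_seresnet152_8xb32-210e_coco-256x192.py')
cfg.load_from = 'checkpoints/td-hm_seresnet152.pth'  # placeholder checkpoint path
```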
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py
new file mode 100644
index 0000000..94ee1e9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=384)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=152,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=48,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..6ac46fd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..8860772
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..ec7d34b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV1',
+ groups=3,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=960,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
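Across these files the only value that has to track the backbone is the head's `in_channels`, which must equal the channel width of the backbone's final feature map. Collected from the configs in this diff (the Swin-B files appear further below):

```python
# HeatmapHead in_channels per backbone, as hard-coded in these config files
head_in_channels = {
    'ShuffleNetV1 (groups=3)': 960,
    'ShuffleNetV2 (widen_factor=1.0)': 1024,
    'SCNet-50/101 and SE-ResNet-50/101/152': 2048,
    'Swin-B (out_indices=(3,))': 1024,
}
```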
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..cff10f4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV1',
+ groups=3,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=960,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..59c8109
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV2',
+ widen_factor=1.0,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..d65aa54
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV2',
+ widen_factor=1.0,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..c29257b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py
@@ -0,0 +1,139 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=128,
+ depths=[2, 2, 18, 2],
+ num_heads=[4, 8, 16, 32],
+ window_size=7,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.3,
+ patch_norm=True,
+ out_indices=(3, ),
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/SwinTransformer/storage/releases/'
+ 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
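
These configs are self-contained, so they can be consumed directly by MMPose's inference API. A minimal sketch, assuming a matching checkpoint — the `checkpoint` value below is a placeholder, not a file shipped in this diff, and `demo.jpg` is any test image:

```python
# Build the Swin-B top-down estimator from this config and run it on one
# image. Pass a local .pth or a download URL whose weights match the config;
# with checkpoint=None the model runs with randomly initialized weights.
from mmpose.apis import inference_topdown, init_model

config = ('modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/'
          'td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py')
checkpoint = None  # placeholder: substitute matching weights

model = init_model(config, checkpoint, device='cpu')
results = inference_topdown(model, 'demo.jpg')   # no bboxes: whole image is one person box
keypoints = results[0].pred_instances.keypoints  # shape (1, 17, 2)
print(keypoints.shape)
```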
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..4bc632a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py
@@ -0,0 +1,139 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=128,
+ depths=[2, 2, 18, 2],
+ num_heads=[4, 8, 16, 32],
+ window_size=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.3,
+ patch_norm=True,
+ out_indices=(3, ),
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/SwinTransformer/storage/releases/'
+ 'download/v1.0.0/swin_base_patch4_window12_384_22k.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
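
The `auto_scale_lr` entry in these configs only takes effect when explicitly enabled (e.g. via the runner's auto-scale-LR option); when it does, MMEngine applies a linear rule against `base_batch_size`. A worked example of that rule:

```python
# Linear LR scaling as applied by MMEngine's auto_scale_lr: the configured
# lr corresponds to base_batch_size=256 (8 GPUs x 32 samples per GPU here);
# other effective batch sizes scale it proportionally.
def scaled_lr(base_lr=5e-4, base_batch=256, gpus=8, samples_per_gpu=32):
    return base_lr * (gpus * samples_per_gpu) / base_batch

print(scaled_lr())           # 0.0005 -> unchanged at the reference batch
print(scaled_lr(gpus=4))     # 0.00025 for a 128-image effective batch
```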
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..3294263
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py
@@ -0,0 +1,148 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={
+ 'absolute_pos_embed': dict(decay_mult=0.),
+ 'relative_position_bias_table': dict(decay_mult=0.),
+ 'norm': dict(decay_mult=0.)
+ }))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=192,
+ depths=[2, 2, 18, 2],
+ num_heads=[6, 12, 24, 48],
+ window_size=7,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.5,
+ patch_norm=True,
+ out_indices=(3, ),
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/SwinTransformer/storage/releases/'
+ 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1536,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
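
Unlike the Swin-B configs above, this one switches to AdamW with `paramwise_cfg.custom_keys`, which zeroes weight decay for position embeddings and norm layers. A minimal plain-PyTorch sketch of the same grouping, assuming `model` is the built pose estimator (MMEngine's matching is substring-based, as here):

```python
# Split parameters into decay / no-decay groups the way the custom_keys
# rules above do: any parameter whose name contains one of these substrings
# gets weight_decay * decay_mult, i.e. 0.0 here.
import torch

NO_DECAY_KEYS = ('absolute_pos_embed', 'relative_position_bias_table', 'norm')

def build_adamw(model, lr=5e-4, weight_decay=0.01):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (no_decay if any(k in name for k in NO_DECAY_KEYS) else decay).append(param)
    return torch.optim.AdamW(
        [dict(params=decay, weight_decay=weight_decay),
         dict(params=no_decay, weight_decay=0.0)],
        lr=lr, betas=(0.9, 0.999))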
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py
new file mode 100644
index 0000000..643cbc2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py
@@ -0,0 +1,148 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW',
+ lr=5e-4,
+ betas=(0.9, 0.999),
+ weight_decay=0.01,
+ ),
+ paramwise_cfg=dict(
+ custom_keys={
+ 'absolute_pos_embed': dict(decay_mult=0.),
+ 'relative_position_bias_table': dict(decay_mult=0.),
+ 'norm': dict(decay_mult=0.)
+ }))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=192,
+ depths=[2, 2, 18, 2],
+ num_heads=[6, 12, 24, 48],
+        window_size=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.5,
+ patch_norm=True,
+ out_indices=(3, ),
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/SwinTransformer/storage/releases/'
+            'download/v1.0.0/swin_large_patch4_window12_384_22k.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1536,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
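
Across all of these heatmap configs the codec keeps `heatmap_size` at a quarter of `input_size`, and `GenerateTarget` renders each keypoint as a 2D Gaussian with the configured `sigma`. An illustrative sketch of that target (the real MSRAHeatmap codec additionally handles target weights and keypoints that fall off the grid):

```python
import numpy as np

def gaussian_heatmap(kpt_xy, input_size=(288, 384), heatmap_size=(72, 96), sigma=2):
    """Render one keypoint (x, y in input pixels) as a Gaussian heatmap."""
    w, h = heatmap_size
    stride = input_size[0] / w                 # 4x downsampling in both axes
    cx, cy = kpt_xy[0] / stride, kpt_xy[1] / stride
    xs = np.arange(w)[None, :]
    ys = np.arange(h)[:, None]
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))

hm = gaussian_heatmap((144.0, 192.0))
print(hm.shape, round(float(hm.max()), 3))     # (96, 72) with a peak of 1.0
```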
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py
new file mode 100644
index 0000000..9c4ab23
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py
@@ -0,0 +1,139 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SwinTransformer',
+ embed_dims=96,
+ depths=[2, 2, 6, 2],
+ num_heads=[3, 6, 12, 24],
+ window_size=7,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.2,
+ patch_norm=True,
+ out_indices=(3, ),
+ with_cp=False,
+ convert_weights=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://github.com/SwinTransformer/storage/releases/'
+ 'download/v1.0.0/swin_tiny_patch4_window7_224.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
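
`test_cfg` enables flip testing in every config here: the image and its mirror are both run, the mirrored heatmaps are flipped back with left/right keypoint channels swapped, and the two maps are averaged; `shift_heatmap=True` additionally shifts the flipped map one pixel right, which empirically improves alignment. A conceptual sketch, not the MMPose implementation — `flip_pairs` would be COCO's left/right keypoint index pairs:

```python
import numpy as np

def flip_test(forward, img, flip_pairs):
    """forward: (H, W, 3) image -> (K, h, w) heatmaps."""
    hm = forward(img)
    hm_flip = forward(img[:, ::-1])[:, :, ::-1].copy()  # mirror input, un-mirror output
    for left, right in flip_pairs:                      # swap left/right channels
        hm_flip[[left, right]] = hm_flip[[right, left]]
    hm_flip[:, :, 1:] = hm_flip[:, :, :-1].copy()       # shift_heatmap=True
    return 0.5 * (hm + hm_flip)
```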
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..f50c2b4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='VGG',
+ depth=16,
+ norm_cfg=dict(type='BN'),
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://vgg16_bn'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=512,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
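
At inference time the head's heatmaps are decoded back to image coordinates. A sketch of the classic MSRA-style decode this codec family uses — argmax on the heatmap grid, a quarter-pixel nudge toward the higher neighbour, then scaling by the 4x stride (illustrative; the actual decoding lives in the codec):

```python
import numpy as np

def decode_keypoint(heatmap, stride=4):
    h, w = heatmap.shape
    y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
    dx = dy = 0.0
    if 0 < x < w - 1:                    # quarter-offset refinement
        dx = 0.25 * np.sign(heatmap[y, x + 1] - heatmap[y, x - 1])
    if 0 < y < h - 1:
        dy = 0.25 * np.sign(heatmap[y + 1, x] - heatmap[y - 1, x])
    return (x + dx) * stride, (y + dy) * stride
```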
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..7be5676
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ViPNAS_MobileNetV3'),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=160,
+ out_channels=17,
+ deconv_out_channels=(160, 160, 160),
+ deconv_num_groups=(160, 160, 160),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
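
The `ViPNASHead` above sets `deconv_num_groups` equal to `deconv_out_channels`, so each upsampling layer is effectively a depthwise transposed convolution, which is where most of the parameter savings come from. A one-layer PyTorch equivalent (the kernel size of 4 is an assumption matching the head's default):

```python
import torch
import torch.nn as nn

# groups == channels -> depthwise ConvTranspose2d: one 4x4 filter per channel
deconv = nn.ConvTranspose2d(160, 160, kernel_size=4, stride=2, padding=1, groups=160)

x = torch.randn(1, 160, 8, 6)
print(deconv(x).shape)                              # torch.Size([1, 160, 16, 12])
print(sum(p.numel() for p in deconv.parameters()))  # 2720: 160*16 weights + 160 biases
```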
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..9477532
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ViPNAS_ResNet', depth=50),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=608,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
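
These files are ordinary MMEngine configs, so they can be loaded and tweaked from Python before launching a run, which is often easier than editing the file for one-off experiments. A minimal sketch — note that `data_root` was interpolated into the dataset dicts at parse time, so reassigning it afterwards does not propagate:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/'
    'td-hm_vipnas-res50_8xb64-210e_coco-256x192.py')

cfg.train_dataloader.batch_size = 32       # halve the per-GPU batch
cfg.train_dataloader.num_workers = 4
print(cfg.optim_wrapper.optimizer.type)    # 'Adam'
print(cfg.codec.input_size)                # (192, 256)
```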
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md
new file mode 100644
index 0000000..a03c8fc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.md
@@ -0,0 +1,39 @@
+
+
+
+VGG (ICLR'2015)
+
+```bibtex
+@article{simonyan2014very,
+ title={Very deep convolutional networks for large-scale image recognition},
+ author={Simonyan, Karen and Zisserman, Andrew},
+ journal={arXiv preprint arXiv:1409.1556},
+ year={2014}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [vgg](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py) | 256x192 | 0.699 | 0.890 | 0.769 | 0.754 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192-7e7c58d6_20210517.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192_20210517.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml
new file mode 100644
index 0000000..6de1830
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vgg_coco.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - VGG
+ Training Data: COCO
+ Name: td-hm_vgg16-bn_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.699
+ AP@0.5: 0.89
+ AP@0.75: 0.769
+ AR: 0.754
+ AR@0.5: 0.927
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/vgg/vgg16_bn_coco_256x192-7e7c58d6_20210517.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md
new file mode 100644
index 0000000..e138d21
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md
@@ -0,0 +1,40 @@
+
+
+
+ViPNAS (CVPR'2021)
+
+```bibtex
+@inproceedings{xu2021vipnas,
+ title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
+ author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year={2021}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on the COCO val2017 dataset
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.700 | 0.887 | 0.783 | 0.758 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192-e0987441_20221010.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192_20221010.log) |
+| [S-ViPNAS-Res50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.711 | 0.894 | 0.787 | 0.769 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192-35d4bff9_20220917.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192_20220917.log) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml
new file mode 100644
index 0000000..66f1819
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.yml
@@ -0,0 +1,40 @@
+Collections:
+- Name: ViPNAS
+ Paper:
+ Title: 'ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search'
+ URL: https://arxiv.org/abs/2105.10154
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/vipnas.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: &id001
+ - ViPNAS
+ Training Data: COCO
+ Name: td-hm_vipnas-mbv3_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.7
+ AP@0.5: 0.887
+ AP@0.75: 0.783
+ AR: 0.758
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+  Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192-e0987441_20221010.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-hm_vipnas-res50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.711
+ AP@0.5: 0.894
+ AP@0.75: 0.787
+ AR: 0.769
+ AR@0.5: 0.934
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192-35d4bff9_20220917.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
new file mode 100644
index 0000000..b29fd86
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md
@@ -0,0 +1,63 @@
+To use ViTPose, you need [MMPreTrain](https://github.com/open-mmlab/mmpretrain). Install the required version with the following command:
+
+```shell
+mim install 'mmpretrain>=1.0.0'
+```
+
+
+
+
+ViTPose (NeurIPS'2022)
+
+```bibtex
+@inproceedings{
+ xu2022vitpose,
+ title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation},
+ author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao},
+ booktitle={Advances in Neural Information Processing Systems},
+ year={2022},
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+  title={Microsoft coco: Common objects in context},
+  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+  booktitle={European conference on computer vision},
+  pages={740--755},
+  year={2014},
+  organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017 with a detector having a human AP of 56.4 on the COCO val2017 dataset
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) |
+| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.788 | 0.917 | 0.855 | 0.839 | 0.954 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) |
+| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.790 | 0.916 | 0.857 | 0.840 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_3rdparty_coco-256x192-5b738c8e_20230314.pth) | - |
+
+*Models with \* are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose). The config files of these models are only for validation.*
+
+> With simple decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.736 | 0.900 | 0.811 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.json) |
+| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.756 | 0.906 | 0.826 | 0.809 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.json) |
+| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.780 | 0.914 | 0.851 | 0.833 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.json) |
+| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.789 | 0.916 | 0.856 | 0.839 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml
new file mode 100644
index 0000000..fd70420
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml
@@ -0,0 +1,155 @@
+Collections:
+- Name: ViTPose
+ Paper:
+ Title: 'ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation'
+ URL: https://arxiv.org/abs/2204.12484
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/vitpose.md
+ Metadata:
+ Training Resources: 8x A100 GPUs
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: &id001
+ - ViTPose
+ - Classic Head
+ Model Size: Small
+ Training Data: COCO
+ Name: td-hm_ViTPose-small_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.739
+ AP@0.5: 0.903
+ AP@0.75: 0.816
+ AR: 0.792
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Base
+ Training Data: COCO
+ Name: td-hm_ViTPose-base_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.757
+ AP@0.5: 0.905
+ AP@0.75: 0.829
+ AR: 0.81
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Large
+ Training Data: COCO
+ Name: td-hm_ViTPose-large_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.782
+ AP@0.5: 0.914
+ AP@0.75: 0.850
+ AR: 0.834
+ AR@0.5: 0.952
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Huge
+ Training Data: COCO
+ Name: td-hm_ViTPose-huge_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.788
+ AP@0.5: 0.917
+ AP@0.75: 0.855
+ AR: 0.839
+ AR@0.5: 0.954
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Alias: vitpose-s
+ Metadata:
+ Architecture: &id002
+ - ViTPose
+ - Simple Head
+ Model Size: Small
+ Training Data: COCO
+ Name: td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.736
+ AP@0.5: 0.900
+ AP@0.75: 0.811
+ AR: 0.790
+ AR@0.5: 0.940
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Alias:
+ - vitpose
+ - vitpose-b
+ Metadata:
+ Architecture: *id002
+ Model Size: Base
+ Training Data: COCO
+ Name: td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.756
+ AP@0.5: 0.906
+ AP@0.75: 0.826
+ AR: 0.809
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-0b8234ea_20230407.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Alias: vitpose-l
+ Metadata:
+ Architecture: *id002
+ Model Size: Large
+ Training Data: COCO
+ Name: td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.780
+ AP@0.5: 0.914
+ AP@0.75: 0.851
+ AR: 0.833
+ AR@0.5: 0.952
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py
+ In Collection: ViTPose
+ Alias: vitpose-h
+ Metadata:
+ Architecture: *id002
+ Model Size: Huge
+ Training Data: COCO
+ Name: td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.789
+ AP@0.5: 0.916
+ AP@0.75: 0.856
+ AR: 0.839
+ AR@0.5: 0.953
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth
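
The `Alias` fields in this metafile are what `MMPoseInferencer` resolves short model names against (resolution happens via the metafiles shipped with the installed mmpose package, which carry the same entries), so the simple-decoder ViTPose variants can be pulled up without spelling out config and checkpoint paths. A sketch — weights download on first use, and `demo.jpg` is a placeholder:

```python
from mmpose.apis import MMPoseInferencer

inferencer = MMPoseInferencer(pose2d='vitpose-s')  # resolved via the Alias above
result = next(inferencer('demo.jpg'))              # the call returns a generator
print(result['predictions'])                       # per-instance keypoints and scores
```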
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py
new file mode 100644
index 0000000..1edee28
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py
@@ -0,0 +1,216 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/',
+# f'{data_root}': 's3://openmmlab/datasets/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
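
This CSPNeXt config trains in two stages: heavy augmentation for the first 180 epochs, then `mmdet.PipelineSwitchHook` swaps in the milder `train_pipeline_stage2` for the last 30, alongside an EMA of the weights. A conceptual paraphrase of what the switch hook does (not the mmdet implementation):

```python
class PipelineSwitchSketch:
    """At switch_epoch, replace the training dataset's pipeline in place."""

    def __init__(self, switch_epoch, switch_pipeline):
        self.switch_epoch = switch_epoch
        self.switch_pipeline = switch_pipeline
        self._switched = False

    def before_train_epoch(self, runner):
        if runner.epoch >= self.switch_epoch and not self._switched:
            runner.train_dataloader.dataset.pipeline = self.switch_pipeline
            self._switched = True
```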
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md
new file mode 100644
index 0000000..734e210
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md
@@ -0,0 +1,56 @@
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+
+
+Results on CrowdPose test with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [pose_cspnext_m](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py) | 256x192 | 0.662 | 0.821 | 0.723 | 0.759 | 0.675 | 0.539 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml
new file mode 100644
index 0000000..6201813
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.yml
@@ -0,0 +1,20 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture:
+ - UDP
+ - CSPNeXt
+ Training Data: CrowdPose
+ Name: cspnext-m_udp_8xb64-210e_crowpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.662
+ AP (E): 0.759
+ AP (H): 0.539
+ AP (M): 0.675
+ AP@0.5: 0.821
+ AP@0.75: 0.723
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md
new file mode 100644
index 0000000..5fdb1aa
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md
@@ -0,0 +1,38 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+
+
+Results on CrowdPose test with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.675 | 0.825 | 0.729 | 0.770 | 0.687 | 0.553 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192_20201227.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml
new file mode 100644
index 0000000..f090812
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.yml
@@ -0,0 +1,19 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture:
+ - HRNet
+ Training Data: CrowdPose
+ Name: td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.675
+ AP (E): 0.77
+ AP (H): 0.553
+ AP (M): 0.687
+ AP@0.5: 0.825
+ AP@0.75: 0.729
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md
new file mode 100644
index 0000000..d987f26
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md
@@ -0,0 +1,58 @@
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+CrowdPose (CVPR'2019)
+
+```bibtex
+@article{li2018crowdpose,
+ title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
+ author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
+ journal={arXiv preprint arXiv:1812.00324},
+ year={2018}
+}
+```
+
+Results on the CrowdPose test set with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
+
+| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log |
+| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.637 | 0.808 | 0.692 | 0.738 | 0.650 | 0.506 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192_20201227.log.json) |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.647 | 0.810 | 0.703 | 0.745 | 0.658 | 0.521 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192_20201227.log.json) |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py) | 320x256 | 0.661 | 0.821 | 0.714 | 0.759 | 0.672 | 0.534 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256_20201227.log.json) |
+| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.656 | 0.818 | 0.712 | 0.754 | 0.666 | 0.533 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192_20201227.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml
new file mode 100644
index 0000000..15802eb
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.yml
@@ -0,0 +1,71 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: CrowdPose
+ Name: td-hm_res50_8xb64-210e_crowdpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.637
+ AP (E): 0.738
+ AP (H): 0.506
+ AP (M): 0.65
+ AP@0.5: 0.808
+ AP@0.75: 0.692
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: td-hm_res101_8xb64-210e_crowdpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.647
+ AP (E): 0.745
+ AP (H): 0.521
+ AP (M): 0.658
+ AP@0.5: 0.81
+ AP@0.75: 0.703
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: td-hm_res101_8xb64-210e_crowdpose-320x256
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.661
+ AP (E): 0.759
+ AP (H): 0.534
+ AP (M): 0.672
+ AP@0.5: 0.821
+ AP@0.75: 0.714
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: CrowdPose
+ Name: td-hm_res152_8xb64-210e_crowdpose-256x192
+ Results:
+ - Dataset: CrowdPose
+ Metrics:
+ AP: 0.656
+ AP (E): 0.754
+ AP (H): 0.533
+ AP (M): 0.666
+ AP@0.5: 0.818
+ AP@0.75: 0.712
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth
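The `&id001` / `*id001` markers above are plain YAML anchors and aliases: the architecture list is written once on the first model and reused by the other three, so every entry resolves to the same `[SimpleBaseline2D, ResNet]` list after parsing. A quick self-contained check with PyYAML:

```python
import yaml

doc = yaml.safe_load("""
Models:
- Name: res50
  Architecture: &id001
  - SimpleBaseline2D
  - ResNet
- Name: res101
  Architecture: *id001
""")

# The alias expands to the anchored list, so both models share it.
assert doc["Models"][1]["Architecture"] == ["SimpleBaseline2D", "ResNet"]
```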
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py
new file mode 100644
index 0000000..c5ec67a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py
@@ -0,0 +1,152 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=14,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
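For a quick smoke test of a config like the one above, the same `init_model` / `inference_topdown` APIs this PR imports in `main.py` can be used directly. A minimal sketch: the config path assumes this repo's layout, the checkpoint URL is the one listed in `hrnet_crowdpose.yml`, and `person.jpg` is a placeholder image path:

```python
from mmpose.apis import inference_topdown
from mmpose.apis import init_model as init_pose_estimator
from mmpose.structures import merge_data_samples

config = ("modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/"
          "crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py")
checkpoint = ("https://download.openmmlab.com/mmpose/top_down/hrnet/"
              "hrnet_w32_crowdpose_256x192-960be101_20201227.pth")

pose_estimator = init_pose_estimator(config, checkpoint, device="cpu")

# With no detector boxes supplied, inference_topdown falls back to the
# whole image as a single person bounding box.
results = inference_topdown(pose_estimator, "person.jpg")
keypoints = merge_data_samples(results).pred_instances.keypoints
print(keypoints.shape)  # should be (1, 14, 2): CrowdPose uses 14 keypoints
```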
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py
new file mode 100644
index 0000000..ef78bbc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py
new file mode 100644
index 0000000..4ffb602
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 320), heatmap_size=(64, 80), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
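Note the size conventions in these configs: the codec's `input_size` and `heatmap_size` are `(width, height)` tuples, the file names use `height x width` (so `320x256` above is a 320-tall, 256-wide crop), and the heatmap is the input downsampled 4x per side. A quick arithmetic check against the two crops used in this directory:

```python
# (width, height) as written in the codec settings of these configs.
for input_size, name in [((192, 256), "256x192"), ((256, 320), "320x256")]:
    w, h = input_size
    heatmap = (w // 4, h // 4)   # MSRAHeatmap output resolution
    print(f"{name}: crop {h}x{w} -> heatmap {heatmap}")

# Prints (48, 64) for 256x192 and (64, 80) for 320x256, matching the
# heatmap_size values declared above.
```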
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py
new file mode 100644
index 0000000..d53e2d1
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py
new file mode 100644
index 0000000..2ae99ce
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=14,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CrowdPoseDataset'
+data_mode = 'topdown'
+data_root = 'data/crowdpose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mmpose_crowdpose_test.json',
+ bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
+ use_area=False,
+ iou_type='keypoints_crowd',
+ prefix='crowdpose')
+test_evaluator = val_evaluator
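All of these configs inherit `_base_ = ['../../../_base_/default_runtime.py']`, which `mmengine` merges in at load time; the loaded config is an attribute-style dict that can be inspected and overridden before training. A sketch, assuming the relative `_base_` path resolves inside this repo's `modules/rtmpose/configs` tree:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    "modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/"
    "crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py")

print(cfg.train_cfg.max_epochs)   # 210, defined in this file
print(cfg.model.backbone.depth)   # 50

# Example override before training: shrink the per-GPU batch size;
# auto_scale_lr (base_batch_size=512) then rescales the LR to match.
cfg.train_dataloader.batch_size = 32
```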
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md
new file mode 100644
index 0000000..524164e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.md
@@ -0,0 +1,38 @@
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+ExLPose (CVPR'2023)
+
+```bibtex
+@inproceedings{ExLPose_2023_CVPR,
+ title={Human Pose Estimation in Extremely Low-Light Conditions},
+ author={Lee, Sohyun and Rim, Jaesung and Jeong, Boseung and Kim, Geonu and Woo, ByungJu and Lee, Haechan and Cho, Sunghyun and Kwak, Suha},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year={2023}
+}
+```
+
+Results on the ExLPose-LL-A val set with ground-truth bounding boxes
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py) | 256x192 | 0.401 | 0.64 | 0.40 | 0.452 | 0.693 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml
new file mode 100644
index 0000000..b24cbdd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/hrnet_exlpose.yml
@@ -0,0 +1,18 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture:
+ - HRNet
+ Training Data: ExLPose-LL
+ Name: td-hm_hrnet-w32_8xb64-210e_exlpose-256x192
+ Results:
+ - Dataset: ExLPose
+ Metrics:
+ AP: 0.401
+ AP@0.5: 0.64
+ AP@0.75: 0.40
+ AR: 0.452
+ AR@0.5: 0.693
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-ll-256x192.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py
new file mode 100644
index 0000000..95cfc6f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/exlpose/td-hm_hrnet-w32_8xb64-210e_exlpose-256x192.py
@@ -0,0 +1,149 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=14,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'ExlposeDataset'
+data_mode = 'topdown'
+data_root = 'data/ExLPose/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ExLPose/ExLPose_train_LL.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/ExLPose/ExLPose_test_LL-A.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/ExLPose/ExLPose_test_LL-A.json',
+ use_area=False)
+test_evaluator = val_evaluator
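One difference worth flagging: the CrowdPose evaluators above pass `iou_type='keypoints_crowd'` (CrowdPose's crowd-index OKS, which is what produces the AP (E)/(M)/(H) breakdown) plus a `crowdpose` prefix, while this ExLPose evaluator keeps the default keypoint OKS and only sets `use_area=False`. A hedged sketch of building both variants through the MMPose registry, assuming the annotation files exist at the paths used above:

```python
from mmpose.registry import METRICS

# CrowdPose-style metric, as configured in the crowdpose configs.
crowd_metric = METRICS.build(dict(
    type="CocoMetric",
    ann_file="data/crowdpose/annotations/mmpose_crowdpose_test.json",
    use_area=False,
    iou_type="keypoints_crowd",
    prefix="crowdpose"))

# ExLPose-style metric: default OKS, no crowd index, default prefix.
exlpose_metric = METRICS.build(dict(
    type="CocoMetric",
    ann_file="data/ExLPose/annotations/ExLPose/ExLPose_test_LL-A.json",
    use_area=False))
```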
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md
new file mode 100644
index 0000000..71b825e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.md
@@ -0,0 +1,80 @@
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+Human-Art (CVPR'2023)
+
+```bibtex
+@inproceedings{ju2023humanart,
+ title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
+ author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year={2023}
+}
+```
+
+Results on the Human-Art validation set, using a detector with a human AP of 56.2 on the same set
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.252 | 0.397 | 0.255 | 0.321 | 0.485 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) |
+| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.399 | 0.545 | 0.420 | 0.466 | 0.613 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) |
+| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.271 | 0.413 | 0.277 | 0.339 | 0.499 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 256x192 | 0.417 | 0.553 | 0.442 | 0.481 | 0.617 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) |
+
+Results on the Human-Art validation set with ground-truth bounding boxes
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.533 | 0.771 | 0.562 | 0.574 | 0.792 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) |
+| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.754 | 0.906 | 0.812 | 0.783 | 0.916 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) |
+| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.557 | 0.782 | 0.593 | 0.595 | 0.804 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 256x192 | 0.769 | 0.906 | 0.825 | 0.796 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) |
+
+Results on COCO val2017, using a detector with a human AP of 56.4 on COCO val2017
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [pose_hrnet_w32-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) |
+| [pose_hrnet_w32-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py) | 256x192 | 0.741 | 0.902 | 0.814 | 0.795 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.json) |
+| [pose_hrnet_w48-coco](configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py) | 256x192 | 0.756 | 0.908 | 0.826 | 0.809 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192_20220913.log) |
+| [pose_hrnet_w48-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py) | 256x192 | 0.751 | 0.905 | 0.822 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml
new file mode 100644
index 0000000..d49a662
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/hrnet_humanart.yml
@@ -0,0 +1,74 @@
+Collections:
+- Name: HRNet
+ Paper:
+ Title: Deep high-resolution representation learning for human pose estimation
+ URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnet.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: &id002
+ - COCO
+ - Human-Art
+ Name: td-hm_hrnet-w32_8xb64-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.741
+ AP@0.5: 0.902
+ AP@0.75: 0.814
+ AR: 0.795
+ AR@0.5: 0.941
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.399
+ AP@0.5: 0.545
+ AP@0.75: 0.420
+ AR: 0.466
+ AR@0.5: 0.613
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.754
+ AP@0.5: 0.906
+ AP@0.75: 0.812
+ AR: 0.783
+ AR@0.5: 0.916
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: td-hm_hrnet-w48_8xb32-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.751
+ AP@0.5: 0.905
+ AP@0.75: 0.822
+ AR: 0.805
+ AR@0.5: 0.943
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.417
+ AP@0.5: 0.553
+ AP@0.75: 0.442
+ AR: 0.481
+ AR@0.5: 0.617
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.769
+ AP@0.5: 0.906
+ AP@0.75: 0.825
+ AR: 0.796
+ AR@0.5: 0.919
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_hrnet-w48_8xb32-210e_humanart-256x192-05178983_20230614.pth
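The `Weights` URLs in these entries are direct links to the released checkpoints, so they can be cached locally for offline runs. A small sketch using torch's hub cache; the URL is copied from the HRNet-w32 entry above, and the key layout shown is typical for OpenMMLab checkpoints rather than guaranteed:

```python
import torch

url = ("https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/"
       "topdown_heatmap/human_art/"
       "td-hm_hrnet-w32_8xb64-210e_humanart-256x192-0773ef0b_20230614.pth")

# Downloads once into ~/.cache/torch/hub/checkpoints and returns the dict.
ckpt = torch.hub.load_state_dict_from_url(url, map_location="cpu")
print(sorted(ckpt.keys()))  # usually includes 'meta' and 'state_dict'
```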
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py
new file mode 100644
index 0000000..c28de59
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.75,
+ custom_keys={
+ 'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='base',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.3,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_base.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/'
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
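The `paramwise_cfg` above implements layer-wise learning-rate decay: with `LayerDecayOptimWrapperConstructor`, parameters in earlier transformer blocks get their LR multiplied by a larger power of `layer_decay_rate`, so the MAE-pretrained lower layers move more slowly than the head. A rough back-of-envelope for this ViTPose-base setup; the exact layer indexing in MMPose's constructor may differ by one:

```python
base_lr = 5e-4      # optimizer lr above
decay = 0.75        # layer_decay_rate
num_layers = 12     # ViT-B depth

# Approximate per-layer scale: embeddings act as "layer 0",
# the head as the final layer with scale 1.0.
for layer in range(num_layers + 2):
    scale = decay ** (num_layers + 1 - layer)
    print(f"layer {layer:2d}: lr ~ {base_lr * scale:.2e}")
# Layer 0 (patch/pos embeddings) lands near 5e-4 * 0.75**13 ~ 1.2e-5,
# while the head keeps the full 5e-4.
```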
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py
new file mode 100644
index 0000000..92a51d1
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=32,
+ layer_decay_rate=0.85,
+ custom_keys={
+ 'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='huge',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.55,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_huge.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/'
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py
new file mode 100644
index 0000000..ec7edd2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=24,
+ layer_decay_rate=0.8,
+ custom_keys={
+ 'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch='large',
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.5,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_large.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/'
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py
new file mode 100644
index 0000000..ce27e97
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+custom_imports = dict(
+ imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'],
+ allow_failed_imports=False)
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1),
+ paramwise_cfg=dict(
+ num_layers=12,
+ layer_decay_rate=0.8,
+ custom_keys={
+ 'bias': dict(decay_mult=0.0),
+ 'pos_embed': dict(decay_mult=0.0),
+ 'relative_position_bias_table': dict(decay_mult=0.0),
+ 'norm': dict(decay_mult=0.0),
+ },
+ ),
+ constructor='LayerDecayOptimWrapperConstructor',
+ clip_grad=dict(max_norm=1., norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='mmpretrain.VisionTransformer',
+ arch={
+ 'embed_dims': 384,
+ 'num_layers': 12,
+ 'num_heads': 12,
+ 'feedforward_channels': 384 * 4
+ },
+ img_size=(256, 192),
+ patch_size=16,
+ qkv_bias=True,
+ drop_path_rate=0.1,
+ with_cls_token=False,
+ out_type='featmap',
+ patch_cfg=dict(padding=2),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'v1/pretrained_models/mae_pretrain_vit_small.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=384,
+ out_channels=17,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+data_root = 'data/'
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
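
The `auto_scale_lr = dict(base_batch_size=512)` entry encodes the linear LR scaling rule: when training is launched with auto-scaling enabled, the learning rate is multiplied by the ratio of the effective batch size to `base_batch_size`. A small sketch of that arithmetic (our reading of MMEngine's behaviour, not its actual code):

```python
def scaled_lr(base_lr: float, num_gpus: int, batch_per_gpu: int,
              base_batch_size: int = 512) -> float:
    """Scale the learning rate linearly with the effective batch size."""
    return base_lr * (num_gpus * batch_per_gpu) / base_batch_size

# The file name encodes 8xb64: 8 GPUs x 64 samples = 512, so lr stays 5e-4.
print(scaled_lr(5e-4, num_gpus=8, batch_per_gpu=64))  # 0.0005
# On a single GPU with batch 64, the rule would give 6.25e-05 instead.
print(scaled_lr(5e-4, num_gpus=1, batch_per_gpu=64))  # 6.25e-05
```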
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py
new file mode 100644
index 0000000..00bfd37
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
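
As a usage note, any of these configs can be trained through MMEngine's standard `Runner` entry point, which is what mmpose's `tools/train.py` wraps. A hedged sketch, assuming the HRNet-w32 config path from this diff and a writable work directory:

```python
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/'
    'td-hm_hrnet-w32_8xb64-210e_humanart-256x192.py')
cfg.work_dir = 'work_dirs/td-hm_hrnet-w32_humanart'  # checkpoints and logs land here

runner = Runner.from_cfg(cfg)
runner.train()  # 210 epochs, validating every 10, per train_cfg above
```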
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py
new file mode 100644
index 0000000..21269e4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_hrnet-w48_8xb32-210e_humanart-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'HumanArtDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart_coco.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/validation_humanart.json',
+ bbox_file=f'{data_root}HumanArt/person_detection_results/'
+ 'HumanArt_validation_detections_AP_H_56_person.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'HumanArt/annotations/validation_humanart.json')
+test_evaluator = val_evaluator
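
To make the `codec` settings concrete: `MSRAHeatmap` renders each keypoint as a Gaussian of the configured `sigma` on a grid downsampled 4x from the input (192x256 input -> 48x64 heatmap). A from-scratch illustration of such a target, not mmpose's actual encoder:

```python
import numpy as np

def gaussian_heatmap(kpt_xy, heatmap_size=(48, 64), stride=4, sigma=2.0):
    """Render one keypoint (input-image coords) as an (h, w) Gaussian heatmap."""
    w, h = heatmap_size                      # (width, height), as in the config
    xs, ys = np.meshgrid(np.arange(w), np.arange(h))
    cx, cy = kpt_xy[0] / stride, kpt_xy[1] / stride
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))

hm = gaussian_heatmap((96.0, 128.0))            # keypoint at (x=96, y=128)
print(hm.shape)                                 # (64, 48)
print(np.unravel_index(hm.argmax(), hm.shape))  # (32, 24) -> peak at (96, 128)/4
```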
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md
new file mode 100644
index 0000000..dc0e52f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.md
@@ -0,0 +1,97 @@
+To use ViTPose, you'll need [MMPreTrain](https://github.com/open-mmlab/mmpretrain) installed. To install the required version, run the following command:
+
+```shell
+mim install 'mmpretrain>=1.0.0'
+```
+
+
+
+
+
+ViTPose (NeurIPS'2022)
+
+```bibtex
+@inproceedings{
+ xu2022vitpose,
+ title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation},
+ author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao},
+ booktitle={Advances in Neural Information Processing Systems},
+ year={2022},
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+
+Human-Art (CVPR'2023)
+
+```bibtex
+@inproceedings{ju2023humanart,
+ title={Human-Art: A Versatile Human-Centric Dataset Bridging Natural and Artificial Scenes},
+ author={Ju, Xuan and Zeng, Ailing and Wang, Jianan and Xu, Qiang and Zhang, Lei},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year={2023}
+}
+```
+
+
+
+Results on the Human-Art validation set, obtained with a detector that scores 56.2 human AP on the same validation set
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.228 | 0.371 | 0.229 | 0.298 | 0.467 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) |
+| [ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.381 | 0.532 | 0.405 | 0.448 | 0.602 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) |
+| [ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.270 | 0.423 | 0.272 | 0.340 | 0.510 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) |
+| [ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.410 | 0.549 | 0.434 | 0.475 | 0.615 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) |
+| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.342 | 0.498 | 0.357 | 0.413 | 0.577 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) |
+| [ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py) | 256x192 | 0.459 | 0.592 | 0.487 | 0.525 | 0.656 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) |
+| [ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.377 | 0.541 | 0.391 | 0.447 | 0.615 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) |
+| [ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.468 | 0.594 | 0.498 | 0.534 | 0.655 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) |
+
+Results on the Human-Art validation set with ground-truth bounding boxes
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.507 | 0.758 | 0.531 | 0.551 | 0.780 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) |
+| [ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.738 | 0.905 | 0.802 | 0.768 | 0.911 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) |
+| [ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.555 | 0.782 | 0.590 | 0.599 | 0.809 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) |
+| [ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.759 | 0.905 | 0.823 | 0.790 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) |
+| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.637 | 0.838 | 0.689 | 0.677 | 0.859 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) |
+| [ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py) | 256x192 | 0.789 | 0.916 | 0.845 | 0.819 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) |
+| [ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.665 | 0.860 | 0.715 | 0.701 | 0.871 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) |
+| [ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.800 | 0.926 | 0.855 | 0.828 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) |
+
+Results on COCO val2017, obtained with a detector that scores 56.4 human AP on the same set
+
+> With classic decoder
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [ViTPose-S-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) |
+| [ViTPose-S-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py) | 256x192 | 0.737 | 0.902 | 0.811 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.json) |
+| [ViTPose-B-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) |
+| [ViTPose-B-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py) | 256x192 | 0.758 | 0.906 | 0.829 | 0.812 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.json) |
+| [ViTPose-L-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) |
+| [ViTPose-L-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py) | 256x192 | 0.782 | 0.914 | 0.849 | 0.835 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.json) |
+| [ViTPose-H-coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.788 | 0.917 | 0.855 | 0.839 | 0.954 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) |
+| [ViTPose-H-humanart-coco](configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py) | 256x192 | 0.788 | 0.914 | 0.853 | 0.841 | 0.956 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.json) |
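
For completeness, a hedged sketch of running one of the checkpoints tabulated above through mmpose's top-down inference API; `demo.jpg` is a placeholder image, and the config/checkpoint paths are copied from the table:

```python
from mmpose.apis import inference_topdown, init_model

config = ('modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/'
          'td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py')
checkpoint = ('https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/'
              'topdown_heatmap/human_art/'
              'td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth')

model = init_model(config, checkpoint, device='cpu')
# With bboxes omitted, the whole image is treated as a single person box.
results = inference_topdown(model, 'demo.jpg')
print(results[0].pred_instances.keypoints.shape)  # (1, 17, 2)
```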
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml
new file mode 100644
index 0000000..2d2ba30
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/vitpose_humanart.yml
@@ -0,0 +1,145 @@
+Collections:
+- Name: ViTPose
+ Paper:
+ Title: 'ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation'
+ URL: https://arxiv.org/abs/2204.12484
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/vitpose.md
+ Metadata:
+ Training Resources: 8x A100 GPUs
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-small_8xb64-210e_humanart-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: &id001
+ - ViTPose
+ - Classic Head
+ Model Size: Small
+ Training Data: &id002
+ - COCO
+ - Human-Art
+ Name: td-hm_ViTPose-small_8xb64-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.737
+ AP@0.5: 0.902
+ AP@0.75: 0.811
+ AR: 0.792
+ AR@0.5: 0.942
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.381
+ AP@0.5: 0.532
+ AP@0.75: 0.405
+ AR: 0.448
+ AR@0.5: 0.602
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.738
+ AP@0.5: 0.905
+ AP@0.75: 0.802
+ AR: 0.768
+ AR@0.5: 0.911
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-small_8xb64-210e_humanart-256x192-5cbe2bfc_20230611.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-base_8xb64-210e_humanart-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Base
+ Training Data: *id002
+ Name: td-hm_ViTPose-base_8xb64-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.758
+ AP@0.5: 0.906
+ AP@0.75: 0.829
+ AR: 0.812
+ AR@0.5: 0.946
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.410
+ AP@0.5: 0.549
+ AP@0.75: 0.434
+ AR: 0.475
+ AR@0.5: 0.615
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.759
+ AP@0.5: 0.905
+ AP@0.75: 0.823
+ AR: 0.790
+ AR@0.5: 0.917
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-base_8xb64-210e_humanart-256x192-b417f546_20230611.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-large_8xb64-210e_humanart-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Large
+ Training Data: *id002
+ Name: td-hm_ViTPose-large_8xb64-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.782
+ AP@0.5: 0.914
+ AP@0.75: 0.849
+ AR: 0.835
+ AR@0.5: 0.953
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.459
+ AP@0.5: 0.592
+ AP@0.75: 0.487
+ AR: 0.525
+ AR@0.5: 0.656
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.789
+ AP@0.5: 0.916
+ AP@0.75: 0.845
+ AR: 0.819
+ AR@0.5: 0.929
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-large_8xb64-210e_humanart-256x192-9aba9345_20230614.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/humanart/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192.py
+ In Collection: ViTPose
+ Metadata:
+ Architecture: *id001
+ Model Size: Huge
+ Training Data: *id002
+ Name: td-hm_ViTPose-huge_8xb64-210e_humanart-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.788
+ AP@0.5: 0.914
+ AP@0.75: 0.853
+ AR: 0.841
+ AR@0.5: 0.956
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art
+ Metrics:
+ AP: 0.468
+ AP@0.5: 0.594
+ AP@0.75: 0.498
+ AR: 0.534
+ AR@0.5: 0.655
+ Task: Body 2D Keypoint
+ - Dataset: Human-Art(GT)
+ Metrics:
+ AP: 0.800
+ AP@0.5: 0.926
+ AP@0.75: 0.855
+ AR: 0.828
+ AR@0.5: 0.933
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/human_art/td-hm_ViTPose-huge_8xb64-210e_humanart-256x192-603bb573_20230612.pth
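
These `.yml` files follow the OpenMMLab model-index schema, so checkpoint URLs and headline metrics can be pulled out with plain PyYAML. A minimal sketch, assuming the file path from this diff:

```python
import yaml

path = ('modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/humanart/'
        'vitpose_humanart.yml')
with open(path) as f:
    index = yaml.safe_load(f)

for model in index['Models']:
    ap = next(r['Metrics']['AP'] for r in model['Results']
              if r['Dataset'] == 'Human-Art')
    print(f"{model['Name']}: Human-Art AP {ap} -> {model['Weights']}")
```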
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md
new file mode 100644
index 0000000..bb19451
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.md
@@ -0,0 +1,56 @@
+
+
+
+CPM (CVPR'2016)
+
+```bibtex
+@inproceedings{wei2016convolutional,
+ title={Convolutional pose machines},
+ author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
+ booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
+ pages={4724--4732},
+ year={2016}
+}
+```
+
+
+
+
+
+
+JHMDB (ICCV'2013)
+
+```bibtex
+@inproceedings{Jhuang:ICCV:2013,
+ title = {Towards understanding action recognition},
+ author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
+ booktitle = {International Conf. on Computer Vision (ICCV)},
+ month = Dec,
+ pages = {3192-3199},
+ year = {2013}
+}
+```
+
+
+
+Results on Sub-JHMDB dataset
+
+The models are pre-trained on the MPII dataset only. No test-time augmentation (multi-scale / rotation testing) is used.
+
+- Normalized by Person Size
+
+| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log |
+| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: |
+| Sub1 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py) | 368x368 | 96.1 | 91.9 | 81.0 | 78.9 | 96.6 | 90.8 | 87.3 | 89.5 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368_20201122.log.json) |
+| Sub2 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py) | 368x368 | 98.1 | 93.6 | 77.1 | 70.9 | 94.0 | 89.1 | 84.7 | 87.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368_20201122.log.json) |
+| Sub3 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py) | 368x368 | 97.9 | 94.9 | 87.3 | 84.0 | 98.6 | 94.4 | 86.2 | 92.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368_20201122.log.json) |
+| Average | cpm | 368x368 | 97.4 | 93.5 | 81.5 | 77.9 | 96.4 | 91.4 | 86.1 | 89.8 | - | - |
+
+- Normalized by Torso Size
+
+| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log |
+| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: |
+| Sub1 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py) | 368x368 | 89.0 | 63.0 | 54.0 | 54.9 | 68.2 | 63.1 | 61.2 | 66.0 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368_20201122.log.json) |
+| Sub2 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py) | 368x368 | 90.3 | 57.9 | 46.8 | 44.3 | 60.8 | 58.2 | 62.4 | 61.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368_20201122.log.json) |
+| Sub3 | [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py) | 368x368 | 91.0 | 72.6 | 59.9 | 54.0 | 73.2 | 68.5 | 65.8 | 70.3 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368_20201122.log.json) |
+| Average | cpm | 368x368 | 90.1 | 64.5 | 53.6 | 51.1 | 67.4 | 63.3 | 63.1 | 65.7 | - | - |
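
Both tables above report PCK@0.2; they differ only in the normalization length, person (bbox) size in the first and torso size in the second. A simplified sketch of the metric (our reading of mmpose's `JhmdbPCKAccuracy`, not its exact code):

```python
import numpy as np

def pck(pred: np.ndarray, gt: np.ndarray, visible: np.ndarray,
        norm_size: float, thr: float = 0.2) -> float:
    """pred/gt: (K, 2) keypoints; visible: (K,) bool mask.

    A joint counts as correct when its error is below thr * norm_size.
    """
    dist = np.linalg.norm(pred - gt, axis=-1)
    correct = (dist < thr * norm_size) & visible
    return correct.sum() / max(visible.sum(), 1)

gt = np.array([[50., 40.], [60., 80.], [55., 120.]])
pred = gt + np.array([[2., 1.], [30., 0.], [-3., 4.]])
vis = np.ones(3, dtype=bool)
print(pck(pred, gt, vis, norm_size=100.0))  # 2 of 3 joints within 20 px -> ~0.667
```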
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml
new file mode 100644
index 0000000..f923d5b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/cpm_jhmdb.yml
@@ -0,0 +1,116 @@
+Models:
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: &id001
+ - CPM
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 87.3
+ Elb: 81
+ Head: 96.1
+ Hip: 96.6
+ Knee: 90.8
+ Mean: 89.5
+ Sho: 91.9
+ Wri: 78.9
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 84.7
+ Elb: 77.1
+ Head: 98.1
+ Hip: 94.0
+ Knee: 89.1
+ Mean: 87.4
+ Sho: 93.6
+ Wri: 70.9
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 86.2
+ Elb: 87.3
+ Head: 97.9
+ Hip: 98.6
+ Knee: 94.4
+ Mean: 92.4
+ Sho: 94.9
+ Wri: 84.0
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 61.2
+ Elb: 54.0
+ Head: 89.0
+ Hip: 68.2
+ Knee: 63.1
+ Mean: 66.0
+ Sho: 63.0
+ Wri: 54.9
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 62.4
+ Elb: 46.8
+ Head: 90.3
+ Hip: 60.8
+ Knee: 58.2
+ Mean: 61.1
+ Sho: 57.9
+ Wri: 44.3
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub2_368x368-fc742f1f_20201122.pth
+- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 65.8
+ Elb: 59.9
+ Head: 91.0
+ Hip: 73.2
+ Knee: 68.5
+ Mean: 70.3
+ Sho: 72.6
+ Wri: 54.0
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub3_368x368-49337155_20201122.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md
new file mode 100644
index 0000000..d82672f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.md
@@ -0,0 +1,81 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+JHMDB (ICCV'2013)
+
+```bibtex
+@inproceedings{Jhuang:ICCV:2013,
+ title = {Towards understanding action recognition},
+ author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
+ booktitle = {International Conf. on Computer Vision (ICCV)},
+ month = Dec,
+ pages = {3192-3199},
+ year = {2013}
+}
+```
+
+
+
+Results on Sub-JHMDB dataset
+
+The models are pre-trained on the MPII dataset only. *No* test-time augmentation (multi-scale / rotation testing) is used.
+
+- Normalized by Person Size
+
+| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log |
+| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: |
+| Sub1 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py) | 256x256 | 99.1 | 98.0 | 93.8 | 91.3 | 99.4 | 96.5 | 92.8 | 96.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256_20201122.log.json) |
+| Sub2 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py) | 256x256 | 99.3 | 97.1 | 90.6 | 87.0 | 98.9 | 96.3 | 94.1 | 95.0 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256_20201122.log.json) |
+| Sub3 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py) | 256x256 | 99.0 | 97.9 | 94.0 | 91.6 | 99.7 | 98.0 | 94.7 | 96.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256_20201122.log.json) |
+| Average | pose_resnet_50 | 256x256 | 99.2 | 97.7 | 92.8 | 90.0 | 99.3 | 96.9 | 93.9 | 96.0 | - | - |
+| Sub1 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py) | 256x256 | 99.1 | 98.5 | 94.6 | 92.0 | 99.4 | 94.6 | 92.5 | 96.1 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256_20201122.log.json) |
+| Sub2 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py) | 256x256 | 99.3 | 97.8 | 91.0 | 87.0 | 99.1 | 96.5 | 93.8 | 95.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256_20201122.log.json) |
+| Sub3 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py) | 256x256 | 98.8 | 98.4 | 94.3 | 92.1 | 99.8 | 97.5 | 93.8 | 96.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256_20201122.log.json) |
+| Average | pose_resnet_50 (2 Deconv.) | 256x256 | 99.1 | 98.2 | 93.3 | 90.4 | 99.4 | 96.2 | 93.4 | 96.0 | - | - |
+
+- Normalized by Torso Size
+
+| Split | Arch | Input Size | Head | Sho | Elb | Wri | Hip | Knee | Ank | Mean | ckpt | log |
+| :------ | :------------------------------------------------: | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-------------------------------------------------: | :------------------------------------------------: |
+| Sub1 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py) | 256x256 | 93.3 | 83.2 | 74.4 | 72.7 | 85.0 | 81.2 | 78.9 | 81.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256_20201122.log.json) |
+| Sub2 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py) | 256x256 | 94.1 | 74.9 | 64.5 | 62.5 | 77.9 | 71.9 | 78.6 | 75.5 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256_20201122.log.json) |
+| Sub3 | [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py) | 256x256 | 97.0 | 82.2 | 74.9 | 70.7 | 84.7 | 83.7 | 84.2 | 82.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256_20201122.log.json) |
+| Average | pose_resnet_50 | 256x256 | 94.8 | 80.1 | 71.3 | 68.6 | 82.5 | 78.9 | 80.6 | 80.1 | - | - |
+| Sub1 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py) | 256x256 | 92.4 | 80.6 | 73.2 | 70.5 | 82.3 | 75.4 | 75.0 | 79.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256_20201122.log.json) |
+| Sub2 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py) | 256x256 | 93.4 | 73.6 | 63.8 | 60.5 | 75.1 | 68.4 | 75.5 | 73.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256_20201122.log.json) |
+| Sub3 | [pose_resnet_50 (2 Deconv.)](/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py) | 256x256 | 96.1 | 81.2 | 72.6 | 67.9 | 83.6 | 80.9 | 81.5 | 81.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256_20201122.log.json) |
+| Average | pose_resnet_50 (2 Deconv.) | 256x256 | 94.0 | 78.5 | 69.9 | 66.3 | 80.3 | 74.9 | 77.3 | 78.0 | - | - |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml
new file mode 100644
index 0000000..a4a9de3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/resnet_jhmdb.yml
@@ -0,0 +1,231 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub1-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 92.8
+ Elb: 93.8
+ Head: 99.1
+ Hip: 99.4
+ Knee: 96.5
+ Mean: 96.1
+ Sho: 98.0
+ Wri: 91.3
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub2-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 94.1
+ Elb: 90.6
+ Head: 99.3
+ Hip: 98.9
+ Knee: 96.3
+ Mean: 95.0
+ Sho: 97.1
+ Wri: 87.0
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub3-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 94.7
+ Elb: 94.0
+ Head: 99.0
+ Hip: 99.7
+ Knee: 98.0
+ Mean: 96.7
+ Sho: 97.9
+ Wri: 91.6
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 92.5
+ Elb: 94.6
+ Head: 99.1
+ Hip: 99.4
+ Knee: 94.6
+ Mean: 96.1
+ Sho: 98.5
+ Wri: 92.0
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 93.8
+ Elb: 91.0
+ Head: 99.3
+ Hip: 99.1
+ Knee: 96.5
+ Mean: 95.2
+ Sho: 97.8
+ Wri: 87.0
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 93.8
+ Elb: 94.3
+ Head: 98.8
+ Hip: 99.8
+ Knee: 97.5
+ Mean: 96.7
+ Sho: 98.4
+ Wri: 92.1
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub1-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 78.9
+ Elb: 74.4
+ Head: 93.3
+ Hip: 85.0
+ Knee: 81.2
+ Mean: 81.9
+ Sho: 83.2
+ Wri: 72.7
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub1_256x256-932cb3b4_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub2-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 78.6
+ Elb: 64.5
+ Head: 94.1
+ Hip: 77.9
+ Knee: 71.9
+ Mean: 75.5
+ Sho: 74.9
+ Wri: 62.5
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub2_256x256-83d606f7_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50_8xb64-20e_jhmdb-sub3-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 84.2
+ Elb: 74.9
+ Head: 97.0
+ Hip: 84.7
+ Knee: 83.7
+ Mean: 82.9
+ Sho: 82.2
+ Wri: 70.7
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_jhmdb_sub3_256x256-c4ec1a0b_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 75.0
+ Elb: 73.2
+ Head: 92.4
+ Hip: 82.3
+ Knee: 75.4
+ Mean: 79.2
+ Sho: 80.6
+ Wri: 70.5
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub1_256x256-f0574a52_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 75.5
+ Elb: 63.8
+ Head: 93.4
+ Hip: 75.1
+ Knee: 68.4
+ Mean: 73.7
+ Sho: 73.6
+ Wri: 60.5
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub2_256x256-f63af0ff_20201122.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: JHMDB
+ Name: td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256
+ Results:
+ - Dataset: JHMDB
+ Metrics:
+ Ank: 81.5
+ Elb: 72.6
+ Head: 96.1
+ Hip: 83.6
+ Knee: 80.9
+ Mean: 81.2
+ Sho: 81.2
+ Wri: 67.9
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_2deconv_jhmdb_sub3_256x256-c4bc2ddb_20201122.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py
new file mode 100644
index 0000000..fb59f0a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py
@@ -0,0 +1,127 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=15,
+ feat_channels=128,
+ num_stages=6),
+ head=dict(
+ type='CPMHead',
+ in_channels=15,
+ out_channels=15,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py
new file mode 100644
index 0000000..84875ca
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py
@@ -0,0 +1,127 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=15,
+ feat_channels=128,
+ num_stages=6),
+ head=dict(
+ type='CPMHead',
+ in_channels=15,
+ out_channels=15,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py
new file mode 100644
index 0000000..9995cbf
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py
@@ -0,0 +1,127 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=15,
+ feat_channels=128,
+ num_stages=6),
+ head=dict(
+ type='CPMHead',
+ in_channels=15,
+ out_channels=15,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py
new file mode 100644
index 0000000..8eba9a4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
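+
+# Note (added comment): the default HeatmapHead uses three 4x4 deconv
+# layers, decoding a 256x256 input to 64x64 heatmaps (stride 4). This
+# "2deconv" variant keeps only two deconv layers, which is why the codec
+# above declares 32x32 heatmaps (stride 8).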
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py
new file mode 100644
index 0000000..627f74e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py
new file mode 100644
index 0000000..c61e18b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=40, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=40,
+ milestones=[20, 30],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(32, 32), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ deconv_out_channels=(256, 256),
+ deconv_kernel_sizes=(4, 4),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py
new file mode 100644
index 0000000..2bb5068
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[8, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub1_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py
new file mode 100644
index 0000000..3cdcaff
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[8, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub2_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py
new file mode 100644
index 0000000..151a2a3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[8, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ResNet', depth=50),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=15,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth' # noqa: E501
+
+# base dataset settings
+dataset_type = 'JhmdbDataset'
+data_mode = 'topdown'
+data_root = 'data/jhmdb/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/Sub3_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md
new file mode 100644
index 0000000..ac25f9c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.md
@@ -0,0 +1,39 @@
+
+
+
+CPM (CVPR'2016)
+
+```bibtex
+@inproceedings{wei2016convolutional,
+ title={Convolutional pose machines},
+ author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
+ booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
+ pages={4724--4732},
+ year={2016}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [cpm](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py) | 368x368 | 0.876 | 0.285 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368-116e62b8_20200822.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368_20200822.log.json) |
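+
+As a minimal usage sketch (the image path and device are assumptions, not
+part of this repo), the checkpoint above can be loaded through MMPose's
+high-level APIs:
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# Config from this repo; checkpoint URL taken from the table above.
+config = ('configs/body_2d_keypoint/topdown_heatmap/mpii/'
+          'td-hm_cpm_8xb64-210e_mpii-368x368.py')
+checkpoint = ('https://download.openmmlab.com/mmpose/top_down/cpm/'
+              'cpm_mpii_368x368-116e62b8_20200822.pth')
+
+model = init_model(config, checkpoint, device='cpu')
+# Without explicit bboxes, the whole image is treated as a single instance.
+results = inference_topdown(model, 'demo.jpg')
+print(results[0].pred_instances.keypoints.shape)  # (1, 16, 2) for MPII
+```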
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml
new file mode 100644
index 0000000..077e0cb
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cpm_mpii.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py
+ In Collection: CPM
+ Metadata:
+ Architecture:
+ - CPM
+ Training Data: MPII
+ Name: td-hm_cpm_8xb64-210e_mpii-368x368
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.876
+ Mean@0.1: 0.285
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_mpii_368x368-116e62b8_20200822.pth
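+
+# Usage sketch (comments only; assumes PyYAML is installed): model-index
+# files like this one can be read programmatically to map each model to its
+# released weights:
+#
+#   import yaml
+#   with open('cpm_mpii.yml') as f:
+#       index = yaml.safe_load(f)
+#   for m in index['Models']:
+#       print(m['Name'], '->', m['Weights'])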
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..d9c4552
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,210 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # use cosine lr from 105 to 210 epoch
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=1024)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file=f'{data_root}/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
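+
+# Schedule note (comments only): with max_epochs = 210 and
+# stage2_num_epochs = 30, PipelineSwitchHook swaps in train_pipeline_stage2
+# at epoch 210 - 30 = 180, and CosineAnnealingLR decays the LR from epoch
+# 210 // 2 = 105 up to 210. The parsed values can be checked from a separate
+# script (the config path is an assumption):
+#
+#     from mmengine.config import Config
+#     cfg = Config.fromfile('configs/body_2d_keypoint/topdown_heatmap/mpii/'
+#                           'cspnext-m_udp_8xb64-210e_mpii-256x256.py')
+#     print(cfg.param_scheduler[1].begin)      # 105
+#     print(cfg.custom_hooks[1].switch_epoch)  # 180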
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md
new file mode 100644
index 0000000..1256ae9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md
@@ -0,0 +1,57 @@
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_cspnext_m](/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py) | 256x256 | 0.902 | 0.303 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml
new file mode 100644
index 0000000..e1c738c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py
+ In Collection: UDP
+ Metadata:
+ Architecture:
+ - UDP
+ - CSPNeXt
+ Training Data: MPII
+ Name: cspnext-m_udp_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.902
+ Mean@0.1: 0.303
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md
new file mode 100644
index 0000000..ca29dc2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.md
@@ -0,0 +1,41 @@
+
+
+
+Hourglass (ECCV'2016)
+
+```bibtex
+@inproceedings{newell2016stacked,
+ title={Stacked hourglass networks for human pose estimation},
+ author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
+ booktitle={European conference on computer vision},
+ pages={483--499},
+ year={2016},
+ organization={Springer}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py) | 256x256 | 0.889 | 0.317 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256-ae358435_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256_20200812.log.json) |
+| [pose_hourglass_52](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py) | 384x384 | 0.894 | 0.367 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384-04090bc3_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384_20200812.log.json) |
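+
+Here "Mean" is PCKh@0.5 and "Mean@0.1" is PCKh@0.1: the fraction of
+predicted joints within 50% (resp. 10%) of the head-segment length from the
+ground truth. A simplified sketch of the metric (it ignores the visibility
+masking that the full `MpiiPCKAccuracy` evaluation applies):
+
+```python
+import numpy as np
+
+def pckh(pred, gt, head_size, thr=0.5):
+    """Fraction of joints within `thr * head_size` of the ground truth.
+
+    pred, gt: (N, K, 2) predicted / ground-truth coordinates;
+    head_size: (N,) head-segment lengths derived from the MPII headboxes.
+    """
+    dist = np.linalg.norm(pred - gt, axis=-1)               # (N, K)
+    return float((dist <= thr * head_size[:, None]).mean())
+
+# 'Mean' in the table corresponds to thr=0.5, 'Mean@0.1' to thr=0.1.
+```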
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml
new file mode 100644
index 0000000..17a5c3c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hourglass_mpii.yml
@@ -0,0 +1,28 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture: &id001
+ - Hourglass
+ Training Data: MPII
+ Name: td-hm_hourglass52_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.889
+ Mean@0.1: 0.317
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_256x256-ae358435_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_hourglass52_8xb32-210e_mpii-384x384
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.894
+ Mean@0.1: 0.367
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_mpii_384x384-04090bc3_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md
new file mode 100644
index 0000000..5a089f2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.md
@@ -0,0 +1,57 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_hrnet_w32_dark](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py) | 256x256 | 0.904 | 0.354 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark-f1601c5b_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark_20200927.log.json) |
+| [pose_hrnet_w48_dark](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py) | 256x256 | 0.905 | 0.360 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark-0decd39f_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark_20200927.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml
new file mode 100644
index 0000000..1f19ecf
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_dark_mpii.yml
@@ -0,0 +1,29 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ - DarkPose
+ Training Data: MPII
+ Name: td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.904
+ Mean@0.1: 0.354
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_dark-f1601c5b_20200927.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.905
+ Mean@0.1: 0.36
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_dark-0decd39f_20200927.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md
new file mode 100644
index 0000000..c8ea9e3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.md
@@ -0,0 +1,40 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py) | 256x256 | 0.900 | 0.334 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256-6c4f923f_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256_20200812.log.json) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py) | 256x256 | 0.901 | 0.337 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256-92cab7bd_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml
new file mode 100644
index 0000000..b2ead58
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/hrnet_mpii.yml
@@ -0,0 +1,28 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: MPII
+ Name: td-hm_hrnet-w32_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.9
+ Mean@0.1: 0.334
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_mpii_256x256-6c4f923f_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_hrnet-w48_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.901
+ Mean@0.1: 0.337
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_mpii_256x256-92cab7bd_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md
new file mode 100644
index 0000000..21211e6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.md
@@ -0,0 +1,39 @@
+
+
+
+LiteHRNet (CVPR'2021)
+
+```bibtex
+@inproceedings{Yulitehrnet21,
+ title={Lite-HRNet: A Lightweight High-Resolution Network},
+ author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
+ booktitle={CVPR},
+ year={2021}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [LiteHRNet-18](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py) | 256x256 | 0.859 | 0.260 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256-cabd7984_20210623.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256_20210623.log.json) |
+| [LiteHRNet-30](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py) | 256x256 | 0.869 | 0.271 | [ckpt](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256-faae8bd8_20210622.pth) | [log](https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256_20210622.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml
new file mode 100644
index 0000000..940eaf6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/litehrnet_mpii.yml
@@ -0,0 +1,28 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: &id001
+ - LiteHRNet
+ Training Data: MPII
+ Name: td-hm_litehrnet-18_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.859
+ Mean@0.1: 0.26
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet18_mpii_256x256-cabd7984_20210623.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_litehrnet-30_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.869
+ Mean@0.1: 0.271
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/litehrnet/litehrnet30_mpii_256x256-faae8bd8_20210622.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md
new file mode 100644
index 0000000..6343855
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.md
@@ -0,0 +1,39 @@
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py) | 256x256 | 0.854 | 0.234 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256-e068afa7_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml
new file mode 100644
index 0000000..09d65dd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/mobilenetv2_mpii.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: MPII
+ Name: td-hm_mobilenetv2_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.854
+ Mean@0.1: 0.234
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_mpii_256x256-e068afa7_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md
new file mode 100644
index 0000000..790746b
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.md
@@ -0,0 +1,58 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.882 | 0.286 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256_20200812.log.json) |
+| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256-416f5d71_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256_20200812.log.json) |
+| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.889 | 0.303 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256-3ecba29d_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml
new file mode 100644
index 0000000..14ae910
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnet_mpii.yml
@@ -0,0 +1,42 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: MPII
+ Name: td-hm_res50_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.882
+ Mean@0.1: 0.286
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_mpii_256x256-418ffc88_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_res101_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.888
+ Mean@0.1: 0.29
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_mpii_256x256-416f5d71_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_res152_8xb32-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.889
+ Mean@0.1: 0.303
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_mpii_256x256-3ecba29d_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md
new file mode 100644
index 0000000..09ffe42
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.md
@@ -0,0 +1,41 @@
+
+
+
+ResNetV1D (CVPR'2019)
+
+```bibtex
+@inproceedings{he2019bag,
+ title={Bag of tricks for image classification with convolutional neural networks},
+ author={He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={558--567},
+ year={2019}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.881 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256-2337a92e_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256_20200812.log.json) |
+| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.883 | 0.295 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256-2851d710_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256_20200812.log.json) |
+| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.300 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256-8b10a87c_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml
new file mode 100644
index 0000000..b6c902d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnetv1d_mpii.yml
@@ -0,0 +1,42 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNetV1D
+ Training Data: MPII
+ Name: td-hm_resnetv1d50_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.881
+ Mean@0.1: 0.29
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_mpii_256x256-2337a92e_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_resnetv1d101_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.883
+ Mean@0.1: 0.295
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_mpii_256x256-2851d710_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_resnetv1d152_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.888
+ Mean@0.1: 0.3
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_mpii_256x256-8b10a87c_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md
new file mode 100644
index 0000000..64eb483
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.md
@@ -0,0 +1,30 @@
+
+ResNext (CVPR'2017)
+
+```bibtex
+@inproceedings{xie2017aggregated,
+ title={Aggregated residual transformations for deep neural networks},
+ author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1492--1500},
+ year={2017}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_resnext_152](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.887 | 0.294 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256-df302719_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256_20200927.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml
new file mode 100644
index 0000000..feb338e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/resnext_mpii.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNext
+ Training Data: MPII
+ Name: td-hm_resnext152_8xb32-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.887
+ Mean@0.1: 0.294
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnext/resnext152_mpii_256x256-df302719_20200927.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md
new file mode 100644
index 0000000..eaa8a64
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.md
@@ -0,0 +1,31 @@
+
+SCNet (CVPR'2020)
+
+```bibtex
+@inproceedings{liu2020improving,
+ title={Improving Convolutional Networks with Self-Calibrated Convolutions},
+ author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={10096--10105},
+ year={2020}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_scnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.888 | 0.290 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256-a54b6af5_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256_20200812.log.json) |
+| [pose_scnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.887 | 0.293 | [ckpt](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256-b4c2d184_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256_20200812.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml
new file mode 100644
index 0000000..d132448
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/scnet_mpii.yml
@@ -0,0 +1,29 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - SCNet
+ Training Data: MPII
+ Name: td-hm_scnet50_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.888
+ Mean@0.1: 0.29
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet50_mpii_256x256-a54b6af5_20200812.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_scnet101_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.887
+ Mean@0.1: 0.293
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/scnet/scnet101_mpii_256x256-b4c2d184_20200812.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md
new file mode 100644
index 0000000..812fd70
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.md
@@ -0,0 +1,34 @@
+
+SEResNet (CVPR'2018)
+
+```bibtex
+@inproceedings{hu2018squeeze,
+ title={Squeeze-and-excitation networks},
+ author={Hu, Jie and Shen, Li and Sun, Gang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={7132--7141},
+ year={2018}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_seresnet_50](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.292 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256-1bb21f79_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256_20200927.log.json) |
+| [pose_seresnet_101](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.295 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256-0ba14ff5_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256_20200927.log.json) |
+| [pose_seresnet_152\*](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py) | 256x256 | 0.884 | 0.287 | [ckpt](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256-6ea1e774_20200927.pth) | [log](https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256_20200927.log.json) |
+
+Note that \* means the model was trained from scratch, without ImageNet pre-training.
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml
new file mode 100644
index 0000000..8d6a3e4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/seresnet_mpii.yml
@@ -0,0 +1,42 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - SEResNet
+ Training Data: MPII
+ Name: td-hm_seresnet50_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.884
+ Mean@0.1: 0.292
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet50_mpii_256x256-1bb21f79_20200927.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_seresnet101_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.884
+ Mean@0.1: 0.295
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet101_mpii_256x256-0ba14ff5_20200927.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-hm_seresnet152_8xb32-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.884
+ Mean@0.1: 0.287
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/seresnet/seresnet152_mpii_256x256-6ea1e774_20200927.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md
new file mode 100644
index 0000000..b8ccb8c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.md
@@ -0,0 +1,30 @@
+
+ShufflenetV1 (CVPR'2018)
+
+```bibtex
+@inproceedings{zhang2018shufflenet,
+ title={Shufflenet: An extremely efficient convolutional neural network for mobile devices},
+ author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={6848--6856},
+ year={2018}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py) | 256x256 | 0.824 | 0.195 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256-dcc1c896_20200925.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256_20200925.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml
new file mode 100644
index 0000000..66d6e4e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv1_mpii.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ShufflenetV1
+ Training Data: MPII
+ Name: td-hm_shufflenetv1_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.824
+ Mean@0.1: 0.195
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_mpii_256x256-dcc1c896_20200925.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md
new file mode 100644
index 0000000..7f13623
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.md
@@ -0,0 +1,30 @@
+
+ShufflenetV2 (ECCV'2018)
+
+```bibtex
+@inproceedings{ma2018shufflenet,
+ title={Shufflenet v2: Practical guidelines for efficient cnn architecture design},
+ author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={116--131},
+ year={2018}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+  author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
+  title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2014},
+  month = {June}
+}
+```
+
+
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py) | 256x256 | 0.828 | 0.205 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256-4fb9df2d_20200925.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256_20200925.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml
new file mode 100644
index 0000000..71ff431
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/shufflenetv2_mpii.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ShufflenetV2
+ Training Data: MPII
+ Name: td-hm_shufflenetv2_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.828
+ Mean@0.1: 0.205
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_mpii_256x256-4fb9df2d_20200925.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py
new file mode 100644
index 0000000..cf47ecd
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(368, 368), heatmap_size=(46, 46), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CPM',
+ in_channels=3,
+ out_channels=16,
+ feat_channels=128,
+ num_stages=6),
+ head=dict(
+ type='CPMHead',
+ in_channels=16,
+ out_channels=16,
+ num_stages=6,
+ deconv_out_channels=None,
+ final_layer=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
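
The `codec` block is the part of these configs that defines how ground-truth keypoints become training targets: each keypoint is rendered as a 2D Gaussian on a heatmap downscaled from the input (here 368/46, i.e. a stride of 8). A simplified re-implementation for intuition only, not the mmpose codec itself:

```python
import numpy as np

def encode_keypoint(x, y, heatmap_size=(46, 46), stride=8.0, sigma=2.0):
    """Render one input-space keypoint as an MSRA-style Gaussian heatmap."""
    w, h = heatmap_size
    mu_x, mu_y = x / stride, y / stride      # project to heatmap coordinates
    xs = np.arange(w)[None, :]               # column grid
    ys = np.arange(h)[:, None]               # row grid
    return np.exp(-((xs - mu_x) ** 2 + (ys - mu_y) ** 2) / (2 * sigma ** 2))

hm = encode_keypoint(184.0, 200.0)
print(hm.shape, np.unravel_index(hm.argmax(), hm.shape))  # (46, 46) (25, 23)
```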
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py
new file mode 100644
index 0000000..1754065
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(384, 384), heatmap_size=(96, 96), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=16,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..07f13ce
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=16,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..7ee018d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=16,  # note: smaller than the 64 implied by the 8xb64 file name
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
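
Every config in this directory shares the same schedule: a 500-iteration linear warm-up from `lr * 0.001` up to the base `lr`, then step decay by `gamma=0.1` at epochs 170 and 200. A plain-Python approximation of the resulting learning rate (it assumes epoch 0 contains at least 500 iterations):

```python
def lr_at(epoch, iteration, base_lr=5e-4, warmup_iters=500,
          milestones=(170, 200), gamma=0.1):
    if epoch == 0 and iteration < warmup_iters:            # LinearLR warm-up
        factor = 0.001 + (1.0 - 0.001) * iteration / warmup_iters
        return base_lr * factor
    return base_lr * gamma ** sum(epoch >= m for m in milestones)  # MultiStepLR

print(lr_at(0, 0))    # 5e-07   warm-up start
print(lr_at(100, 0))  # 0.0005  full base lr
print(lr_at(180, 0))  # 5e-05   after the first milestone
print(lr_at(205, 0))  # 5e-06   after the second milestone
```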
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..f22c0f8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
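
This file differs from the plain HRNet-W32 config above only in `unbiased=True`, which switches the codec to the DARK method ("Distribution-Aware Coordinate Representation of Keypoints", Zhang et al., CVPR 2020): unbiased Gaussian encoding plus sub-pixel decoding via a Taylor expansion of the log-heatmap around the integer argmax. A 1-D sketch of that refinement, for intuition only:

```python
import numpy as np

def dark_refine_1d(heatmap, peak):
    """Sub-pixel peak via a 2nd-order Taylor expansion of log(heatmap)."""
    logh = np.log(np.maximum(heatmap, 1e-10))
    grad = (logh[peak + 1] - logh[peak - 1]) / 2.0           # first derivative
    hess = logh[peak + 1] - 2 * logh[peak] + logh[peak - 1]  # second derivative
    return peak - grad / hess if hess != 0 else float(peak)

xs = np.arange(5)
hm = np.exp(-((xs - 2.3) ** 2) / 2.0)        # Gaussian peaked between bins
print(dark_refine_1d(hm, int(hm.argmax())))  # ~2.3, vs. integer argmax 2
```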
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..3101359
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,146 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..9435d79
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..a95e33d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,137 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(2, 4, 2),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..a7b4400
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,137 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(3, 8, 3),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=16,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
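
The only change from the LiteHRNet-18 config above is `num_modules=(3, 8, 3)` in place of `(2, 4, 2)`, i.e. a deeper stack of the same lite modules. Configs like these can also be loaded and tweaked programmatically with mmengine; a sketch, assuming the vendored `_base_` files are in place:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/'
    'td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py')
print(cfg.model.backbone.extra.stages_spec.num_modules)  # (3, 8, 3)

# Fields can be overridden without editing the file, e.g. for a smoke test:
cfg.train_dataloader.batch_size = 8
cfg.train_cfg.max_epochs = 1
```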
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..6b40e19
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
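
Most configs here set `auto_scale_lr = dict(base_batch_size=512)`, matching the `8xb64` in their file names (the `8xb32` ResNet-152 config below uses 256). When auto scaling is enabled at launch (e.g. via the `--auto-scale-lr` flag of the OpenMMLab train scripts), the optimizer's lr is rescaled linearly with the actual total batch size:

```python
def scale_lr(base_lr, num_gpus, samples_per_gpu, base_batch_size=512):
    """Linear scaling rule applied by mmengine's auto_scale_lr."""
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

print(scale_lr(5e-4, 8, 64))  # 0.0005   unchanged at the reference setup
print(scale_lr(5e-4, 1, 64))  # 6.25e-05 on a single GPU
```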
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..0bd5fd8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
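These config files are plain Python dicts consumed by MMEngine. A hedged sketch, not part of this diff, of how a training run is effectively launched from the ResNet-101 config above; it assumes mmpose and mmengine are installed and the MPII data is laid out under `data/mpii/` as the config expects:

```python
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/'
    'td-hm_res101_8xb64-210e_mpii-256x256.py')
cfg.work_dir = 'work_dirs/td-hm_res101_mpii'  # checkpoints and logs land here

runner = Runner.from_cfg(cfg)  # builds the model, loaders and hooks from the dict
runner.train()                 # 210 epochs, validating every 10 per train_cfg
```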
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py
new file mode 100644
index 0000000..a2d86f8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
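One detail worth noting: the ResNet-152 recipe halves the per-GPU batch to 32 and sets `base_batch_size=256` (8 GPUs x 32), so when automatic LR scaling is enabled the linear scaling rule leaves `lr=5e-4` untouched on the reference setup. A quick sketch of the arithmetic; the single-GPU numbers are an assumed example:

```python
base_lr = 5e-4
base_batch = 256              # auto_scale_lr['base_batch_size'] above
world_size, per_gpu = 1, 32   # e.g. running this 8xb32 recipe on one GPU
actual_batch = world_size * per_gpu

scaled_lr = base_lr * actual_batch / base_batch  # linear scaling rule
print(scaled_lr)  # 6.25e-05
```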
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..d22c405
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
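Since each file starts from `_base_ = ['../../../_base_/default_runtime.py']`, the dict you get at load time is the merge of both files. A small inspection sketch, assuming the repo layout so the relative `_base_` path resolves:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/'
    'td-hm_res50_8xb64-210e_mpii-256x256.py')

print(cfg.model.backbone.depth)                # 50
print(cfg.train_dataloader.batch_size)         # 64
# this file's checkpoint settings merged on top of default_runtime.py:
print(cfg.default_hooks.checkpoint.save_best)  # 'PCK'
```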
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..25c4087
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet101_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
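All of these configs share `test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=True)`: predict on the image and its mirror, un-flip the mirrored heatmaps, swap left/right joint channels, optionally shift by one pixel, and average. A rough numpy illustration, simplified from what mmpose does internally; the flip pairs assume the standard 16-joint MPII order:

```python
import numpy as np

# left/right pairs in the 16-joint MPII order (ankles, knees, hips, wrists, ...)
FLIP_PAIRS = [(0, 5), (1, 4), (2, 3), (10, 15), (11, 14), (12, 13)]

def flip_test_merge(heatmaps, heatmaps_flipped, shift=True):
    hf = heatmaps_flipped[..., ::-1].copy()  # undo the horizontal flip
    for a, b in FLIP_PAIRS:                  # swap left/right channels
        hf[[a, b]] = hf[[b, a]]
    if shift:                                # shift_heatmap=True: 1-px shift
        hf[..., 1:] = hf[..., :-1]
    return (heatmaps + hf) * 0.5

merged = flip_test_merge(np.random.rand(16, 64, 64), np.random.rand(16, 64, 64))
print(merged.shape)  # (16, 64, 64)
```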
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..ce43cf3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet152_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
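The `param_scheduler` pair is also identical across the family: a 500-iteration linear warm-up from `0.001 * lr`, then step decays at epochs 170 and 200. A sketch of the resulting learning rate; `iters_per_epoch` is an assumed placeholder, since the real value depends on the dataset size and batch size:

```python
def lr_at(epoch, it=0, iters_per_epoch=300, base_lr=5e-4):
    global_iter = epoch * iters_per_epoch + it
    if global_iter < 500:                    # LinearLR warm-up (by_epoch=False)
        factor = 0.001 + (1 - 0.001) * global_iter / 500
        return base_lr * factor
    if epoch >= 200:                         # second MultiStepLR milestone
        return base_lr * 0.01
    if epoch >= 170:                         # first milestone
        return base_lr * 0.1
    return base_lr

print(lr_at(0))    # 5e-07 (start_factor * lr)
print(lr_at(100))  # 0.0005
print(lr_at(180))  # ~5e-05
print(lr_at(205))  # ~5e-06
```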
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..a2853f8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNetV1d',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://resnet50_v1d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
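In the shared pipelines, `GetBBoxCenterScale` converts a person box into a padded center/size and `TopdownAffine` warps that region to the 256x256 model input. A rough, self-contained sketch of the two steps; it is simplified, as the real transform also pins the box to the input aspect ratio and supports rotation from `RandomBBoxTransform`:

```python
import cv2
import numpy as np

def bbox_center_scale(bbox, padding=1.25):
    """GetBBoxCenterScale, roughly: (x1, y1, x2, y2) -> padded center/size."""
    x1, y1, x2, y2 = bbox
    center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
    scale = np.array([x2 - x1, y2 - y1]) * padding
    return center, scale

def topdown_affine(img, center, scale, input_size=(256, 256)):
    """TopdownAffine, roughly: crop the padded box and resize to input_size."""
    w, h = scale
    src = np.float32([center + [-w / 2, -h / 2],
                      center + [w / 2, -h / 2],
                      center + [-w / 2, h / 2]])
    dst = np.float32([[0, 0], [input_size[0], 0], [0, input_size[1]]])
    return cv2.warpAffine(img, cv2.getAffineTransform(src, dst), input_size)

patch = topdown_affine(np.zeros((480, 640, 3), np.uint8),
                       *bbox_center_scale((100, 50, 300, 400)))
print(patch.shape)  # (256, 256, 3)
```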
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py
new file mode 100644
index 0000000..8bfe3ef
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNeXt',
+ depth=152,
+ init_cfg=dict(
+ type='Pretrained', checkpoint='mmcls://resnext152_32x4d'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
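For readers coming from plain PyTorch, the `train_dataloader` dicts map almost one-to-one onto `torch.utils.data.DataLoader` arguments: `DefaultSampler(shuffle=True)` plays the role of `shuffle=True`, and the nested `dataset=dict(...)` is built via the registry. A stand-in sketch with a dummy dataset in place of `MpiiDataset`:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(128, 3, 256, 256))  # MpiiDataset stand-in
    loader = DataLoader(
        dataset,
        batch_size=32,
        num_workers=2,
        persistent_workers=True,  # keep worker processes alive across epochs
        shuffle=True,             # what DefaultSampler(shuffle=True) provides
    )
    print(len(loader))  # 4 batches
```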
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..9ae0c8d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=101,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet101-94250a77.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
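Unlike the `torchvision://` and `mmcls://` shorthands used elsewhere in this diff, the SCNet configs point `init_cfg` at a raw checkpoint URL, which resolves to an ordinary downloadable state dict. A sketch; the `'state_dict'` unwrapping is a guess at the file layout, since OpenMMLab checkpoints often nest the weights:

```python
import torch

url = ('https://download.openmmlab.com/mmpose/'
       'pretrain_models/scnet101-94250a77.pth')
ckpt = torch.hub.load_state_dict_from_url(url, map_location='cpu')
state = ckpt.get('state_dict', ckpt)  # unwrap if the weights are nested
print(len(state), sorted(state)[:2])  # parameter tensors keyed by module path
```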
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..6e2206a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet50-7ef0a199.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
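The `MpiiPCKAccuracy` evaluator used by every config here reports PCKh: a predicted joint counts as correct when its distance to ground truth falls below a fraction of the head-segment length, which is why the val dataset carries a `headbox_file` pointing at `mpii_gt_val.mat`. A toy version of the metric, ignoring visibility masking:

```python
import numpy as np

def pckh(pred, gt, head_sizes, thr=0.5):
    dist = np.linalg.norm(pred - gt, axis=-1)   # (N, 16) joint errors in px
    norm = head_sizes[:, None]                  # per-sample head length
    return float((dist / norm <= thr).mean() * 100)

pred = np.zeros((1, 16, 2))
gt = np.zeros((1, 16, 2))
gt[0, 0] = (3.0, 4.0)                           # one joint is 5 px off
print(pckh(pred, gt, np.array([8.0])))          # 93.75 (15 of 16 joints correct)
```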
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..7ead848
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
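The only thing separating the SEResNet variants from the plain ResNet configs above is the backbone's squeeze-and-excitation unit. For reference, a compact PyTorch rendition of that unit; this is a generic SE block, not mmpose's exact implementation:

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Channel re-weighting from globally pooled features."""
    def __init__(self, channels, ratio=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // ratio), nn.ReLU(inplace=True),
            nn.Linear(channels // ratio, channels), nn.Sigmoid())

    def forward(self, x):
        w = x.mean(dim=(2, 3))            # squeeze: global average pool
        w = self.fc(w)[:, :, None, None]  # excitation: per-channel gates in (0, 1)
        return x * w

y = SEBlock(64)(torch.randn(2, 64, 32, 32))
print(y.shape)  # torch.Size([2, 64, 32, 32])
```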
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py
new file mode 100644
index 0000000..7c2486d
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py
@@ -0,0 +1,116 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=152,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
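Note that, unlike the 50- and 101-layer variants, this SEResNet-152 backbone declares no `init_cfg`, so it starts from randomly initialized weights. If a warm start is wanted, one option is to point `load_from` at an existing checkpoint when loading the config; a sketch with a hypothetical path:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/'
    'td-hm_seresnet152_8xb32-210e_mpii-256x256.py')
# hypothetical checkpoint path; load_from restores the whole estimator
cfg.load_from = 'work_dirs/td-hm_seresnet152_mpii/epoch_210.pth'
```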
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..c14ba34
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SEResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://se-resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..1b8ac62
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV1',
+ groups=3,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v1'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=960,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..e39aff8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ShuffleNetV2',
+ widen_factor=1.0,
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://shufflenet_v2'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=16,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+ headbox_file='data/mpii/annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md
new file mode 100644
index 0000000..a6c5899
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.md
@@ -0,0 +1,55 @@
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+PoseTrack18 (CVPR'2018)
+
+```bibtex
+@inproceedings{andriluka2018posetrack,
+ title={Posetrack: A benchmark for human pose estimation and tracking},
+ author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={5167--5176},
+ year={2018}
+}
+```
+
+Results on PoseTrack2018 val with ground-truth bounding boxes
+
+| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log |
+| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py) | 256x192 | 86.2 | 89.0 | 84.5 | 79.2 | 82.3 | 82.5 | 78.7 | 83.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192_20201028.log.json) |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py) | 384x288 | 87.1 | 89.0 | 85.1 | 80.2 | 80.6 | 82.8 | 79.6 | 83.7 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288_20211130.log.json) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py) | 256x192 | 88.3 | 90.2 | 86.0 | 81.0 | 80.7 | 83.3 | 80.6 | 84.6 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192_20211130.log.json) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py) | 384x288 | 87.8 | 90.0 | 86.2 | 81.3 | 81.0 | 83.4 | 80.9 | 84.6 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288_20211130.log.json) |
+
+The models are first pre-trained on the COCO dataset and then fine-tuned on PoseTrack18.
+
+Results on PoseTrack2018 val with [MMDetection](https://github.com/open-mmlab/mmdetection) pre-trained [Cascade R-CNN](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) (X-101-64x4d-FPN) human detector
+
+| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log |
+| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py) | 256x192 | 78.0 | 82.9 | 79.5 | 73.8 | 76.9 | 76.6 | 70.2 | 76.9 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192_20201028.log.json) |
+| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py) | 384x288 | 79.9 | 83.6 | 80.4 | 74.5 | 74.8 | 76.1 | 70.5 | 77.3 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288_20211130.log.json) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py) | 256x192 | 80.1 | 83.4 | 80.6 | 74.8 | 74.3 | 76.8 | 70.5 | 77.4 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192_20211130.log.json) |
+| [pose_hrnet_w48](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py) | 384x288 | 80.2 | 83.8 | 80.9 | 75.2 | 74.7 | 76.7 | 71.7 | 77.8 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288_20211130.log.json) |
+
+The models are first pre-trained on the COCO dataset and then fine-tuned on PoseTrack18.
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml
new file mode 100644
index 0000000..c2a0787
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/hrnet_posetrack18.yml
@@ -0,0 +1,154 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 78.7
+ Elb: 84.5
+ Head: 86.2
+ Hip: 82.3
+ Knee: 82.5
+ Shou: 89
+ Total: 83.4
+ Wri: 79.2
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 79.6
+      Elb: 85.1
+ Head: 87.1
+ Hip: 80.6
+ Knee: 82.8
+ Shou: 89
+ Total: 83.7
+ Wri: 80.2
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+      Ankl: 80.6
+      Elb: 86.0
+      Head: 88.3
+      Hip: 80.7
+      Knee: 83.3
+ Shou: 90.2
+ Total: 84.6
+ Wri: 81
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+      Ankl: 80.9
+ Elb: 86.2
+ Head: 87.8
+ Hip: 81
+ Knee: 83.4
+ Shou: 90
+ Total: 84.6
+ Wri: 81.3
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 70.2
+ Elb: 79.5
+ Head: 78.0
+ Hip: 76.9
+ Knee: 76.6
+ Shou: 82.9
+ Total: 76.9
+ Wri: 73.8
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_256x192-1ee951c4_20201028.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 70.5
+ Elb: 80.4
+ Head: 79.9
+ Hip: 74.8
+ Knee: 76.1
+ Shou: 83.6
+ Total: 77.3
+ Wri: 74.5
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_posetrack18_384x288-806f00a3_20211130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+      Ankl: 70.5
+ Elb: 80.6
+ Head: 80.1
+ Hip: 74.3
+ Knee: 76.8
+ Shou: 83.4
+ Total: 77.4
+ Wri: 74.8
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_256x192-b5d9b3f1_20211130.pth
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: PoseTrack18
+ Name: td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 71.7
+ Elb: 80.9
+ Head: 80.2
+ Hip: 74.7
+ Knee: 76.7
+ Shou: 83.8
+ Total: 77.8
+ Wri: 75.2
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_posetrack18_384x288-5fd6d3ff_20211130.pth
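These .yml metafiles are machine-readable: each `Models` entry ties a config to its metrics and checkpoint URL, so picking the strongest PoseTrack18 checkpoint can be scripted. A small sketch against the file above, assuming PyYAML is installed and the repo layout matches the diff:

```python
import yaml

path = ('modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/'
        'posetrack18/hrnet_posetrack18.yml')
with open(path) as f:
    meta = yaml.safe_load(f)

# highest Total; ties resolve to the first entry listed in the file
best = max(meta['Models'], key=lambda m: m['Results'][0]['Metrics']['Total'])
print(best['Name'])     # td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192
print(best['Weights'])  # checkpoint URL to download
```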
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md
new file mode 100644
index 0000000..e172780
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.md
@@ -0,0 +1,58 @@
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+PoseTrack18 (CVPR'2018)
+
+```bibtex
+@inproceedings{andriluka2018posetrack,
+ title={Posetrack: A benchmark for human pose estimation and tracking},
+ author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={5167--5176},
+ year={2018}
+}
+```
+
+Results on PoseTrack2018 val with ground-truth bounding boxes
+
+| Arch | Input Size | Head | Shou | Elb | Wri | Hip | Knee | Ankl | Total | ckpt | log |
+| :--------------------------------------------------- | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :---: | :---------------------------------------------------: | :--------------------------------------------------: |
+| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py) | 256x192 | 86.5 | 87.7 | 82.5 | 75.8 | 80.1 | 78.8 | 74.2 | 81.2 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192-a62807c7_20201028.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192_20201028.log.json) |
+
+The models are first pre-trained on the COCO dataset and then fine-tuned on PoseTrack18.
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml
new file mode 100644
index 0000000..a15fa9f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/resnet_posetrack18.yml
@@ -0,0 +1,22 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
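+    # '&id001' is a YAML anchor naming this architecture list; other model
+    # entries can reuse it via the '*id001' alias instead of repeating it.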
+ Training Data: PoseTrack18
+ Name: td-hm_res50_8xb64-20e_posetrack18-256x192
+ Results:
+ - Dataset: PoseTrack18
+ Metrics:
+ Ankl: 74.2
+ Elb: 82.5
+ Head: 86.5
+ Hip: 80.1
+ Knee: 78.8
+ Shou: 87.7
+ Total: 81.2
+ Wri: 75.8
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_posetrack18_256x192-a62807c7_20201028.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py
new file mode 100644
index 0000000..63e35a4
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[10, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='posetrack18/Total AP', rule='greater', interval=1))
+
+# load from the pretrained model
+load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth' # noqa: E501
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
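+# The 48x64 heatmap is 1/4 of the 192x256 input, matching HRNet's stride-4 output.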
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'PoseTrack18Dataset'
+data_mode = 'topdown'
+data_root = 'data/posetrack18/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_val.json',
+        # comment out `bbox_file` and `filter_cfg` to use ground-truth bboxes for evaluation
+ bbox_file='data/posetrack18/annotations/'
+ 'posetrack18_val_human_detections.json',
+ filter_cfg=dict(bbox_score_thr=0.4),
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='PoseTrack18Metric',
+ ann_file=data_root + 'annotations/posetrack18_val.json',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py
new file mode 100644
index 0000000..04a4522
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[10, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='posetrack18/Total AP', rule='greater', interval=1))
+
+# load from the pretrained model
+load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth' # noqa: E501
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
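+# sigma grows to 3 (vs. 2 at 256x192) so the Gaussian target scales with the larger heatmap.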
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'PoseTrack18Dataset'
+data_mode = 'topdown'
+data_root = 'data/posetrack18/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_val.json',
+        # comment out `bbox_file` and `filter_cfg` to use ground-truth bboxes for evaluation
+ bbox_file='data/posetrack18/annotations/'
+ 'posetrack18_val_human_detections.json',
+ filter_cfg=dict(bbox_score_thr=0.4),
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='PoseTrack18Metric',
+ ann_file=data_root + 'annotations/posetrack18_val.json',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py
new file mode 100644
index 0000000..90e81d0
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[10, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='posetrack18/Total AP', rule='greater', interval=1))
+
+# load from the pretrained model
+load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth' # noqa: E501
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'PoseTrack18Dataset'
+data_mode = 'topdown'
+data_root = 'data/posetrack18/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_val.json',
+        # comment out `bbox_file` and `filter_cfg` to use ground-truth bboxes for evaluation
+ bbox_file='data/posetrack18/annotations/'
+ 'posetrack18_val_human_detections.json',
+ filter_cfg=dict(bbox_score_thr=0.4),
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='PoseTrack18Metric',
+ ann_file=data_root + 'annotations/posetrack18_val.json',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py
new file mode 100644
index 0000000..32189ff
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py
@@ -0,0 +1,155 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[10, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='posetrack18/Total AP', rule='greater', interval=1))
+
+# load from the pretrained model
+load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth' # noqa: E501
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=17,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'PoseTrack18Dataset'
+data_mode = 'topdown'
+data_root = 'data/posetrack18/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_val.json',
+        # comment out `bbox_file` and `filter_cfg` to use ground-truth bboxes for evaluation
+ bbox_file='data/posetrack18/annotations/'
+ 'posetrack18_val_human_detections.json',
+ filter_cfg=dict(bbox_score_thr=0.4),
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='PoseTrack18Metric',
+ ann_file=data_root + 'annotations/posetrack18_val.json',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py
new file mode 100644
index 0000000..22c7c11
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py
@@ -0,0 +1,126 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[10, 15],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='posetrack18/Total AP', rule='greater', interval=1))
+
+# load from the pretrained model
+load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth' # noqa: E501
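+# Loading COCO-pretrained full-model weights here supersedes the backbone's
+# torchvision init below, so training fine-tunes the COCO pose model.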
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=17,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'PoseTrack18Dataset'
+data_mode = 'topdown'
+data_root = 'data/posetrack18/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/posetrack18_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='PoseTrack18Metric',
+ ann_file=data_root + 'annotations/posetrack18_val.json',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md
new file mode 100644
index 0000000..ed247c3
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/README.md
@@ -0,0 +1,32 @@
+# Top-down regression-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the detected bounding boxes. In the second stage, regression-based methods directly regress the keypoint coordinates from the features extracted within the bounding box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html). A minimal inference sketch of this two-stage pipeline is shown below.
+
+
+

+
+
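+The two stages map directly onto the MMDetection and MMPose inference APIs. The
+sketch below assumes a person detector and a regression-based pose model are
+available; all config, checkpoint, and image paths are placeholders:
+
+```python
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_topdown, init_model
+
+# Stage 1: detect person bounding boxes with any MMDetection model.
+detector = init_detector('det_config.py', 'det_checkpoint.pth', device='cpu')
+det_sample = inference_detector(detector, 'demo.jpg')
+pred = det_sample.pred_instances.cpu().numpy()
+person_boxes = pred.bboxes[(pred.labels == 0) & (pred.scores > 0.5)]
+
+# Stage 2: regress keypoints within each detected box (top-down).
+pose_model = init_model('pose_config.py', 'pose_checkpoint.pth', device='cpu')
+pose_samples = inference_topdown(pose_model, 'demo.jpg', person_boxes)
+for sample in pose_samples:
+    print(sample.pred_instances.keypoints)  # (1, K, 2) per person box
+```
+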
+## Results and Models
+
+### COCO Dataset
+
+Results on COCO val2017, using a person detector with 56.4 human AP on COCO val2017
+
+| Model | Input Size | AP | AR | Details and Download |
+| :--------------: | :--------: | :---: | :---: | :-------------------------------------------------------: |
+| ResNet-152+RLE | 256x192 | 0.731 | 0.805 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) |
+| ResNet-101+RLE | 256x192 | 0.722 | 0.768 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) |
+| ResNet-50+RLE | 256x192 | 0.706 | 0.768 | [resnet_rle_coco.md](./coco/resnet_rle_coco.md) |
+| MobileNet-v2+RLE | 256x192 | 0.593 | 0.644 | [mobilenetv2_rle_coco.md](./coco/mobilenetv2_rle_coco.md) |
+| ResNet-152 | 256x192 | 0.584 | 0.688 | [resnet_coco.md](./coco/resnet_coco.md) |
+| ResNet-101 | 256x192 | 0.562 | 0.670 | [resnet_coco.md](./coco/resnet_coco.md) |
+| ResNet-50 | 256x192 | 0.528 | 0.639 | [resnet_coco.md](./coco/resnet_coco.md) |
+
+### MPII Dataset
+
+| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download |
+| :-----------: | :--------: | :------: | :------: | :---------------------------------------------: |
+| ResNet-50+RLE | 256x256 | 0.861 | 0.277 | [resnet_rle_mpii.md](./mpii/resnet_rle_mpii.md) |
+| ResNet-152 | 256x256 | 0.850 | 0.208 | [resnet_mpii.md](./mpii/resnet_mpii.md) |
+| ResNet-101 | 256x256 | 0.841 | 0.200 | [resnet_mpii.md](./mpii/resnet_mpii.md) |
+| ResNet-50 | 256x256 | 0.826 | 0.180 | [resnet_mpii.md](./mpii/resnet_mpii.md) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md
new file mode 100644
index 0000000..825c40c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.md
@@ -0,0 +1,74 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+RLE (ICCV'2021)
+
+```bibtex
+@inproceedings{li2021human,
+ title={Human pose regression with residual log-likelihood estimation},
+ author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={11025--11034},
+ year={2021}
+}
+```
+
+
+
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017, using a person detector with 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [deeppose_mobilenetv2_rle_pretrained](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py) | 256x192 | 0.593 | 0.836 | 0.660 | 0.644 | 0.877 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml
new file mode 100644
index 0000000..1dda49e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/mobilenetv2_rle_coco.yml
@@ -0,0 +1,20 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py
+ In Collection: RLE
+ Metadata:
+ Architecture: &id001
+ - DeepPose
+ - RLE
+ - MobileNet
+ Training Data: COCO
+ Name: td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.593
+ AP@0.5: 0.836
+ AP@0.75: 0.66
+ AR: 0.644
+ AR@0.5: 0.877
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192-39b73bd5_20220922.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md
new file mode 100644
index 0000000..fd9a8c8
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.md
@@ -0,0 +1,59 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017, using a person detector with 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [deeppose_resnet_50](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.541 | 0.824 | 0.601 | 0.649 | 0.893 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.log.json) |
+| [deeppose_resnet_101](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.562 | 0.831 | 0.629 | 0.670 | 0.900 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192-2f247111_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_20210205.log.json) |
+| [deeppose_resnet_152](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py) | 256x192 | 0.584 | 0.842 | 0.659 | 0.688 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192-7df89a88_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_20210205.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml
new file mode 100644
index 0000000..07fae5e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_coco.yml
@@ -0,0 +1,57 @@
+Collections:
+- Name: DeepPose
+ Paper:
+ Title: "Deeppose: Human pose estimation via deep neural networks"
+ URL: http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/deeppose.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: &id001
+ - DeepPose
+ - ResNet
+ Training Data: COCO
+ Name: td-reg_res50_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.541
+ AP@0.5: 0.824
+ AP@0.75: 0.601
+ AR: 0.649
+ AR@0.5: 0.893
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192-72ef04f3_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res101_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.562
+ AP@0.5: 0.831
+ AP@0.75: 0.629
+ AR: 0.67
+ AR@0.5: 0.9
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192-2f247111_20210205.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res152_8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.584
+ AP@0.5: 0.842
+ AP@0.75: 0.659
+ AR: 0.688
+ AR@0.5: 0.907
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192-7df89a88_20210205.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md
new file mode 100644
index 0000000..365d244
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.md
@@ -0,0 +1,78 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+RLE (ICCV'2021)
+
+```bibtex
+@inproceedings{li2021human,
+ title={Human pose regression with residual log-likelihood estimation},
+ author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={11025--11034},
+ year={2021}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+Results on COCO val2017, using a person detector with 56.4 human AP on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [deeppose_resnet_50_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.706 | 0.888 | 0.776 | 0.753 | 0.924 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.log.json) |
+| [deeppose_resnet_50_rle_pretrained](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py) | 256x192 | 0.719 | 0.891 | 0.788 | 0.764 | 0.925 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.log.json) |
+| [deeppose_resnet_101_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.894 | 0.794 | 0.768 | 0.930 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle-16c3d461_20220615.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle_20220615.log.json) |
+| [deeppose_resnet_152_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py) | 256x192 | 0.731 | 0.897 | 0.805 | 0.777 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle-c05bdccf_20220615.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle_20220615.log.json) |
+| [deeppose_resnet_152_rle](/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.901 | 0.815 | 0.793 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle-b77c4c37_20220624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle_20220624.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml
new file mode 100644
index 0000000..dd910e6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/resnet_rle_coco.yml
@@ -0,0 +1,90 @@
+Collections:
+- Name: RLE
+ Paper:
+ Title: Human pose regression with residual log-likelihood estimation
+ URL: https://arxiv.org/abs/2107.11291
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/rle.md
+Models:
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py
+ In Collection: RLE
+ Metadata:
+ Architecture: &id001
+ - DeepPose
+ - RLE
+ - ResNet
+ Training Data: COCO
+ Name: td-reg_res50_rle-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.706
+ AP@0.5: 0.888
+ AP@0.75: 0.776
+ AR: 0.753
+ AR@0.5: 0.924
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192-d37efd64_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py
+ In Collection: RLE
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.719
+ AP@0.5: 0.891
+ AP@0.75: 0.788
+ AR: 0.764
+ AR@0.5: 0.925
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192-2cb494ee_20220913.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py
+ In Collection: RLE
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res101_rle-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.722
+ AP@0.5: 0.894
+ AP@0.75: 0.794
+ AR: 0.768
+ AR@0.5: 0.93
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_coco_256x192_rle-16c3d461_20220615.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py
+ In Collection: RLE
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res152_rle-8xb64-210e_coco-256x192
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.731
+ AP@0.5: 0.897
+ AP@0.75: 0.805
+ AR: 0.777
+ AR@0.5: 0.933
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_256x192_rle-c05bdccf_20220615.pth
+- Config: configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py
+ In Collection: RLE
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: td-reg_res152_rle-8xb64-210e_coco-384x288
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.749
+ AP@0.5: 0.901
+ AP@0.75: 0.815
+ AR: 0.793
+ AR@0.5: 0.935
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_coco_384x288_rle-b77c4c37_20220624.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..c1a2232
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py
@@ -0,0 +1,126 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
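+# RegressionLabel normalizes keypoint coordinates by input_size, so the head
+# regresses values in [0, 1] directly instead of decoding heatmaps.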
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/top_down/'
+ 'mobilenetv2/mobilenetv2_coco_256x192-d1e58e7b_20200727.pth')),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=1280,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
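+        # RLELoss learns the residual error distribution with a normalizing flow
+        # (Li et al., ICCV'21) rather than assuming a fixed Gaussian/Laplace.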
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ),
+)
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
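+# 'bbox_rle' scores each instance by combining its bbox score with the RLE
+# keypoint likelihoods (one of CocoMetric's score_mode options).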
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..e55f676
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..b18ea03
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..64c621f
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..fa35cfa
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py
new file mode 100644
index 0000000..06fa1c7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(288, 384))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..09016f6
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..ceccb7a
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
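+    # 'bbox_rle' rescores each instance using the RLE per-keypoint likelihoods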
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py
new file mode 100644
index 0000000..a1d485c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=1e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=train_cfg['max_epochs'],
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(192, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
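+        # initialize from a heatmap-based model pretrained on COCO;
+        # prefix='backbone.' keeps only the backbone weights of that checkpoint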
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=17,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/person_keypoints_val2017.json',
+ bbox_file=f'{data_root}person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=f'{data_root}annotations/person_keypoints_val2017.json',
+ score_mode='bbox_rle')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md
new file mode 100644
index 0000000..af6df37
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.md
@@ -0,0 +1,58 @@
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [deeppose_resnet_50](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py) | 256x256 | 0.826 | 0.180 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256-c63cd0b6_20210203.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_20210203.log.json) |
+| [deeppose_resnet_101](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py) | 256x256 | 0.841 | 0.200 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256-87516a90_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256_20210205.log.json) |
+| [deeppose_resnet_152](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py) | 256x256 | 0.850 | 0.208 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256-15f5e6f9_20210205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256_20210205.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml
new file mode 100644
index 0000000..95484bc
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_mpii.yml
@@ -0,0 +1,42 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: &id001
+ - DeepPose
+ - ResNet
+ Training Data: MPII
+ Name: td-reg_res50_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.826
+ Mean@0.1: 0.18
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256-c63cd0b6_20210203.pth
+- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-reg_res101_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.841
+ Mean@0.1: 0.2
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res101_mpii_256x256-87516a90_20210205.pth
+- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture: *id001
+ Training Data: MPII
+ Name: td-reg_res152_8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.85
+ Mean@0.1: 0.208
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res152_mpii_256x256-15f5e6f9_20210205.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md
new file mode 100644
index 0000000..9e88cfa
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.md
@@ -0,0 +1,73 @@
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+RLE (ICCV'2021)
+
+```bibtex
+@inproceedings{li2021human,
+ title={Human pose regression with residual log-likelihood estimation},
+ author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu},
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+ pages={11025--11034},
+ year={2021}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+MPII (CVPR'2014)
+
+```bibtex
+@inproceedings{andriluka14cvpr,
+ author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt},
+ title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2014},
+ month = {June}
+}
+```
+
+Results on MPII val set
+
+| Arch | Input Size | Mean | Mean@0.1 | ckpt | log |
+| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: |
+| [deeppose_resnet_50_rle](/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py) | 256x256 | 0.861 | 0.277 | [ckpt](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle-5f92a619_20220504.pth) | [log](https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle_20220504.log.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml
new file mode 100644
index 0000000..c2d3237
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/resnet_rle_mpii.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py
+ In Collection: RLE
+ Metadata:
+ Architecture:
+ - DeepPose
+ - RLE
+ - ResNet
+ Training Data: MPII
+ Name: td-reg_res50_rle-8xb64-210e_mpii-256x256
+ Results:
+ - Dataset: MPII
+ Metrics:
+ Mean: 0.861
+ Mean@0.1: 0.277
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/deeppose/deeppose_res50_mpii_256x256_rle-5f92a619_20220504.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..1576670
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,116 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=16,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
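+        # ground-truth head boxes; MpiiPCKAccuracy normalizes keypoint
+        # distances by head size (PCKh)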
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..f814b67
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,118 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=16,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+file_client_args = dict(backend='disk')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', file_client_args=file_client_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', file_client_args=file_client_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..a2ab46c
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py
@@ -0,0 +1,116 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=16,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py
new file mode 100644
index 0000000..922cee2
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py
@@ -0,0 +1,116 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RLEHead',
+ in_channels=2048,
+ num_joints=16,
+ loss=dict(type='RLELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'MpiiDataset'
+data_mode = 'topdown'
+data_root = 'data/mpii/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform', shift_prob=0),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/mpii_val.json',
+        headbox_file=f'{data_root}annotations/mpii_gt_val.mat',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater'))
+
+# evaluators
+val_evaluator = dict(type='MpiiPCKAccuracy')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md
new file mode 100644
index 0000000..cce6151
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/README.md
@@ -0,0 +1,22 @@
+# YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss
+
+YOLO-Pose (CVPRW'2022)
+
+```bibtex
+@inproceedings{maji2022yolo,
+ title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss},
+ author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={2637--2646},
+ year={2022}
+}
+```
+
+YOLO-Pose is a bottom-up pose estimation approach that simultaneously detects all person instances and regresses keypoint locations in a single pass.
+
+We implement **YOLOX-Pose** on top of the **YOLOX** object detection framework, inheriting YOLO-Pose's benefit of unifying pose estimation and object detection. To predict keypoint locations more accurately, separate branches with adaptive convolutions regress the offsets for the different joints, which allows the feature extraction to be optimized for each keypoint.
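+
+For a quick qualitative check, a trained model can be run through MMPose's generic
+bottom-up inference API. The snippet below is an illustrative sketch, not one of the
+shipped demo scripts: the config/checkpoint pair is a placeholder taken from the model
+zoo table, and an MMPose >= 1.1 installation is assumed.
+
+```python
+from mmpose.apis import inference_bottomup, init_model
+
+# placeholder paths; substitute any config/checkpoint pair from the model zoo
+config = 'configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py'
+checkpoint = 'yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth'
+
+model = init_model(config, checkpoint, device='cuda:0')
+
+# bottom-up: the full image goes in; no separate person detector is needed
+results = inference_bottomup(model, 'demo.jpg')
+keypoints = results[0].pred_instances.keypoints        # (num_persons, 17, 2)
+scores = results[0].pred_instances.keypoint_scores     # (num_persons, 17)
+```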
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md
new file mode 100644
index 0000000..4d1b9e7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.md
@@ -0,0 +1,59 @@
+YOLO-Pose (CVPRW'2022)
+
+```bibtex
+@inproceedings{maji2022yolo,
+ title={Yolo-pose: Enhancing yolo for multi person pose estimation using object keypoint similarity loss},
+ author={Maji, Debapriya and Nagori, Soyeb and Mathew, Manu and Poddar, Deepak},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={2637--2646},
+ year={2022}
+}
+```
+
+YOLOX (arXiv'2021)
+
+```bibtex
+@article{ge2021yolox,
+ title={Yolox: Exceeding yolo series in 2021},
+ author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
+ journal={arXiv preprint arXiv:2107.08430},
+ year={2021}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+Results on COCO val2017
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
+| [yoloxpose_tiny](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py) | 416x416 | 0.526 | 0.793 | 0.556 | 0.571 | 0.833 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-20230829.json) |
+| [yoloxpose_s](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py) | 640x640 | 0.641 | 0.872 | 0.702 | 0.682 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-20230829.json) |
+| [yoloxpose_m](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py) | 640x640 | 0.695 | 0.899 | 0.766 | 0.733 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-20230829.json) |
+| [yoloxpose_l](/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py) | 640x640 | 0.712 | 0.901 | 0.782 | 0.749 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-20230829.json) |
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml
new file mode 100644
index 0000000..400807e
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_coco.yml
@@ -0,0 +1,72 @@
+Collections:
+- Name: YOLOXPose
+ Paper:
+ Title: 'YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss'
+ URL: https://arxiv.org/abs/2204.06806
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/yolopose.md
+Models:
+- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py
+ In Collection: YOLOXPose
+ Metadata:
+ Architecture: &id001
+ - YOLOXPose
+ Training Data: COCO
+ Name: yoloxpose_tiny_4xb64-300e_coco-416
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.526
+ AP@0.5: 0.793
+ AP@0.75: 0.556
+ AR: 0.571
+ AR@0.5: 0.833
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_tiny_4xb64-300e_coco-416-76eb44ca_20230829.pth
+- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py
+ In Collection: YOLOXPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: yoloxpose_s_8xb32-300e_coco-640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.641
+ AP@0.5: 0.872
+ AP@0.75: 0.702
+ AR: 0.682
+ AR@0.5: 0.902
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_s_8xb32-300e_coco-640-56c79c1f_20230829.pth
+- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py
+ In Collection: YOLOXPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: yoloxpose_m_8xb32-300e_coco-640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.695
+ AP@0.5: 0.899
+ AP@0.75: 0.766
+ AR: 0.733
+ AR@0.5: 0.926
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_m_8xb32-300e_coco-640-84e9a538_20230829.pth
+- Config: configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py
+ In Collection: YOLOXPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO
+ Name: yoloxpose_l_8xb32-300e_coco-640
+ Results:
+ - Dataset: COCO
+ Metrics:
+ AP: 0.712
+ AP@0.5: 0.901
+ AP@0.75: 0.782
+ AR: 0.749
+ AR@0.5: 0.926
+ Task: Body 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/yolox_pose/yoloxpose_l_8xb32-300e_coco-640-de0f8dee_20230829.pth
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py
new file mode 100644
index 0000000..a339ed9
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_l_8xb32-300e_coco-640.py
@@ -0,0 +1,17 @@
+_base_ = './yoloxpose_s_8xb32-300e_coco-640.py'
+
+widen_factor = 1
+deepen_factor = 1
+checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \
+ 'l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth'
+
+# model settings
+model = dict(
+ backbone=dict(
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ init_cfg=dict(checkpoint=checkpoint),
+ ),
+ neck=dict(
+ in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3),
+ head=dict(head_module_cfg=dict(widen_factor=widen_factor)))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py
new file mode 100644
index 0000000..613cfc5
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_m_8xb32-300e_coco-640.py
@@ -0,0 +1,16 @@
+_base_ = './yoloxpose_s_8xb32-300e_coco-640.py'
+
+widen_factor = 0.75
+deepen_factor = 0.67
+checkpoint = 'https://download.openmmlab.com/mmpose/v1/pretrained_models/' \
+ 'yolox_m_8x8_300e_coco_20230829.pth'
+
+# model settings
+model = dict(
+ backbone=dict(
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ init_cfg=dict(checkpoint=checkpoint),
+ ),
+ neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2),
+ head=dict(head_module_cfg=dict(widen_factor=widen_factor)))
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py
new file mode 100644
index 0000000..3c935ae
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_s_8xb32-300e_coco-640.py
@@ -0,0 +1,266 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(
+ _delete_=True,
+ type='EpochBasedTrainLoop',
+ max_epochs=300,
+ val_interval=10,
+ dynamic_intervals=[(280, 1)])
+
+auto_scale_lr = dict(base_batch_size=256)
+
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0,
+ bias_decay_mult=0,
+ bypass_duplicate=True,
+ ),
+ clip_grad=dict(max_norm=0.1, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0002,
+ begin=5,
+ T_max=280,
+ end=280,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=280, end=300),
+]
+
+# model
+widen_factor = 0.5
+deepen_factor = 0.33
+
+model = dict(
+ type='BottomupPoseEstimator',
+ init_cfg=dict(
+ type='Kaiming',
+ layer='Conv2d',
+ a=2.23606797749979,
+ distribution='uniform',
+ mode='fan_in',
+ nonlinearity='leaky_relu'),
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[0, 0, 0],
+ std=[1, 1, 1],
+ batch_augments=[
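+            # multi-scale training: each iteration the batch is resized to a
+            # random input size in [480, 800] (multiples of 32)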
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ out_indices=(2, 3, 4),
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
+ 'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
+ '20211121_095711-4592a793.pth',
+ prefix='backbone.',
+ )),
+ neck=dict(
+ type='YOLOXPAFPN',
+ in_channels=[128, 256, 512],
+ out_channels=128,
+ num_csp_blocks=1,
+ use_depthwise=False,
+ upsample_cfg=dict(scale_factor=2, mode='nearest'),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ head=dict(
+ type='YOLOXPoseHead',
+ num_keypoints=17,
+ featmap_strides=(8, 16, 32),
+ head_module_cfg=dict(
+ num_classes=1,
+ in_channels=256,
+ feat_channels=256,
+ widen_factor=widen_factor,
+ stacked_convs=2,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ prior_generator=dict(
+ type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
+ assigner=dict(type='SimOTAAssigner', dynamic_k_indicator='oks'),
+ overlaps_power=0.5,
+ loss_cls=dict(type='BCELoss', reduction='sum', loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_obj=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='sum',
+ loss_weight=1.0),
+ loss_oks=dict(
+ type='OKSLoss',
+ reduction='none',
+ metainfo='configs/_base_/datasets/coco.py',
+ norm_target_weight=True,
+ loss_weight=30.0),
+ loss_vis=dict(
+ type='BCELoss',
+ use_target_weight=True,
+ reduction='mean',
+ loss_weight=1.0),
+ loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
+ ),
+ test_cfg=dict(
+ score_thr=0.01,
+ nms_thr=0.65,
+ ))
+
+# data
+input_size = (640, 640)
+codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
+
+train_pipeline_stage1 = [
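+    # stage 1: strong YOLOX-style augmentation (Mosaic, MixUp, random affine);
+    # replaced by the plain stage-2 pipeline near the end of training (see custom_hooks)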
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=(640, 640),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(
+ type='YOLOXMixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=(640, 640),
+ shift_prob=0,
+ rotate_prob=0,
+ scale_prob=0,
+ scale_type='long',
+ pad_val=(114, 114, 114),
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs'),
+]
+
+data_mode = 'bottomup'
+data_root = 'data/'
+
+dataset_coco = dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ ann_file='coco/annotations/person_keypoints_train2017.json',
+ data_prefix=dict(img='coco/train2017/'),
+ pipeline=train_pipeline_stage1,
+)
+
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=8,
+ persistent_workers=True,
+ pin_memory=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dataset_coco)
+
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ pin_memory=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/person_keypoints_val2017.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
+ score_mode='bbox',
+ nms_mode='none',
+)
+test_evaluator = val_evaluator
+
+custom_hooks = [
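+    # switch to train_pipeline_stage2 (no Mosaic/MixUp) for the last 20 epochs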
+ dict(
+ type='YOLOXPoseModeSwitchHook',
+ num_last_epochs=20,
+ new_train_pipeline=train_pipeline_stage2,
+ priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
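+    # keep an exponential moving average of the weights and use it for evaluation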
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+]
diff --git a/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py
new file mode 100644
index 0000000..68a5ad7
--- /dev/null
+++ b/modules/rtmpose/configs/body_2d_keypoint/yoloxpose/coco/yoloxpose_tiny_4xb64-300e_coco-416.py
@@ -0,0 +1,77 @@
+_base_ = './yoloxpose_s_8xb32-300e_coco-640.py'
+
+# model settings
+widen_factor = 0.375
+deepen_factor = 0.33
+checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_' \
+ 'tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth'
+
+model = dict(
+ data_preprocessor=dict(batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(320, 640),
+ size_divisor=32,
+ interval=1),
+ ]),
+ backbone=dict(
+ deepen_factor=deepen_factor,
+ widen_factor=widen_factor,
+ init_cfg=dict(checkpoint=checkpoint),
+ ),
+ neck=dict(
+ in_channels=[96, 192, 384],
+ out_channels=96,
+ ),
+    head=dict(head_module_cfg=dict(widen_factor=widen_factor)))
+
+# dataset settings
+train_pipeline_stage1 = [
+ dict(type='LoadImage', backend_args=None),
+ dict(
+ type='Mosaic',
+ img_scale=_base_.input_size,
+ pad_val=114.0,
+ pre_transform=[dict(type='LoadImage', backend_args=None)]),
+ dict(
+ type='BottomupRandomAffine',
+ input_size=_base_.input_size,
+ shift_factor=0.1,
+ rotate_factor=10,
+ scale_factor=(0.75, 1.0),
+ pad_val=114,
+ distribution='uniform',
+ transform_mode='perspective',
+ bbox_keep_corner=False,
+ clip_border=True,
+ ),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip'),
+ dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
+ dict(type='GenerateTarget', encoder=_base_.codec),
+ dict(
+ type='PackPoseInputs',
+ extra_mapping_labels={
+ 'bbox': 'bboxes',
+ 'bbox_labels': 'labels',
+ 'keypoints': 'keypoints',
+ 'keypoints_visible': 'keypoints_visible',
+ 'area': 'areas'
+ }),
+]
+train_dataloader = dict(
+ batch_size=64, dataset=dict(pipeline=train_pipeline_stage1))
+
+input_size = (416, 416)
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(
+ type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'input_size', 'input_center', 'input_scale'))
+]
+
+val_dataloader = dict(dataset=dict(pipeline=val_pipeline))
+test_dataloader = val_dataloader
diff --git a/modules/rtmpose/configs/body_3d_keypoint/README.md b/modules/rtmpose/configs/body_3d_keypoint/README.md
new file mode 100644
index 0000000..0010706
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/README.md
@@ -0,0 +1,13 @@
+# Human Body 3D Pose Estimation
+
+3D pose estimation is the detection and analysis of the X, Y, Z coordinates of human body joints from RGB images. For single-person 3D pose estimation from a monocular camera, existing works can be classified into three categories: (1) lifting 2D poses to 3D poses (2D-to-3D pose lifting), (2) jointly learning 2D and 3D poses, and (3) directly regressing 3D poses from images.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/3d_body_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/3d_human_pose_demo.md) to run demos.
diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md
new file mode 100644
index 0000000..78507e0
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/README.md
@@ -0,0 +1,13 @@
+# A simple yet effective baseline for 3d human pose estimation
+
+Simple 3D baseline breaks the task of 3D human pose estimation down into two stages: (1) image → 2D pose, and (2) 2D pose → 3D pose.
+
+The authors find that "lifting" ground-truth 2D joint locations to 3D space is a task that can be solved with a remarkably low error rate. Building on the success of 2D human pose estimation, the method therefore directly "lifts" estimated 2D joint locations into 3D space.
+
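+The lifting network itself is a small fully-connected residual network. The sketch
+below, in plain PyTorch, roughly follows the paper's defaults (1024-d hidden layers,
+batch norm, dropout 0.5, two residual blocks). The `SimpleLifter`/`ResBlock` names are
+illustrative; this is not the exact MMPose implementation.
+
+```python
+import torch
+from torch import nn
+
+
+class ResBlock(nn.Module):
+    """Two Linear -> BatchNorm -> ReLU -> Dropout layers with a skip connection."""
+
+    def __init__(self, dim=1024, p=0.5):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(dim, dim), nn.BatchNorm1d(dim), nn.ReLU(), nn.Dropout(p),
+            nn.Linear(dim, dim), nn.BatchNorm1d(dim), nn.ReLU(), nn.Dropout(p))
+
+    def forward(self, x):
+        return x + self.layers(x)
+
+
+class SimpleLifter(nn.Module):
+    """Lift 2D joints (B, K, 2) to 3D joints (B, K, 3) with a residual MLP."""
+
+    def __init__(self, num_joints=17, dim=1024, num_blocks=2):
+        super().__init__()
+        self.stem = nn.Linear(num_joints * 2, dim)
+        self.blocks = nn.Sequential(*[ResBlock(dim) for _ in range(num_blocks)])
+        self.head = nn.Linear(dim, num_joints * 3)
+
+    def forward(self, kpts_2d):
+        x = self.stem(kpts_2d.flatten(1))  # (B, K*2) -> (B, dim)
+        x = self.blocks(x)
+        return self.head(x).reshape(kpts_2d.shape[0], -1, 3)
+
+
+lifter = SimpleLifter()
+print(lifter(torch.randn(4, 17, 2)).shape)  # torch.Size([4, 17, 3])
+```
+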
+## Results and Models
+
+### Human3.6m Dataset
+
+| Arch | MPJPE | P-MPJPE | ckpt | log | Details and Download |
+| :------------------------------------------ | :---: | :-----: | :-----------------------------------------: | :-----------------------------------------: | :---------------------------------------------------------: |
+| [SimpleBaseline3D](/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py) | 43.4 | 34.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) | [simplebaseline3d_h36m.md](./h36m/simplebaseline3d_h36m.md) |
diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py
new file mode 100644
index 0000000..15af0f5
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py
@@ -0,0 +1,168 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=200, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
+
+# learning policy
+param_scheduler = [
+ dict(type='StepLR', step_size=100000, gamma=0.96, end=80, by_epoch=False)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1))
+
+# codec settings
+# 3D keypoint normalization parameters
+# From file: '{data_root}/annotation_body3d/fps50/joint3d_rel_stats.pkl'
+target_mean = [[-2.55652589e-04, -7.11960570e-03, -9.81433052e-04],
+ [-5.65463051e-03, 3.19636009e-01, 7.19329269e-02],
+ [-1.01705840e-02, 6.91147892e-01, 1.55352986e-01],
+ [2.55651315e-04, 7.11954606e-03, 9.81423866e-04],
+ [-5.09729780e-03, 3.27040413e-01, 7.22258095e-02],
+ [-9.99656606e-03, 7.08277383e-01, 1.58016408e-01],
+ [2.90583676e-03, -2.11363307e-01, -4.74210915e-02],
+ [5.67537804e-03, -4.35088906e-01, -9.76974016e-02],
+ [5.93884964e-03, -4.91891970e-01, -1.10666618e-01],
+ [7.37352083e-03, -5.83948619e-01, -1.31171400e-01],
+ [5.41920653e-03, -3.83931702e-01, -8.68145417e-02],
+ [2.95964662e-03, -1.87567488e-01, -4.34536934e-02],
+ [1.26585822e-03, -1.20170579e-01, -2.82526049e-02],
+ [4.67186639e-03, -3.83644089e-01, -8.55125784e-02],
+ [1.67648571e-03, -1.97007177e-01, -4.31368364e-02],
+ [8.70569015e-04, -1.68664569e-01, -3.73902498e-02]],
+target_std = [[0.11072244, 0.02238818, 0.07246294],
+ [0.15856311, 0.18933832, 0.20880479],
+ [0.19179935, 0.24320062, 0.24756193],
+ [0.11072181, 0.02238805, 0.07246253],
+ [0.15880454, 0.19977188, 0.2147063],
+ [0.18001944, 0.25052739, 0.24853247],
+ [0.05210694, 0.05211406, 0.06908241],
+ [0.09515367, 0.10133032, 0.12899733],
+ [0.11742458, 0.12648469, 0.16465091],
+ [0.12360297, 0.13085539, 0.16433336],
+ [0.14602232, 0.09707956, 0.13952731],
+ [0.24347532, 0.12982249, 0.20230181],
+ [0.2446877, 0.21501816, 0.23938235],
+ [0.13876084, 0.1008926, 0.1424411],
+ [0.23687529, 0.14491219, 0.20980829],
+ [0.24400695, 0.23975028, 0.25520584]]
+# 2D keypoint normalization parameters
+# From file: '{data_root}/annotation_body3d/fps50/joint2d_stats.pkl'
+keypoints_mean = [[532.08351635, 419.74137558], [531.80953144, 418.2607141],
+ [530.68456967, 493.54259285], [529.36968722, 575.96448516],
+ [532.29767646, 421.28483336], [531.93946631, 494.72186795],
+ [529.71984447, 578.96110365], [532.93699382, 370.65225054],
+ [534.1101856, 317.90342311], [534.55416813, 304.24143901],
+ [534.86955004, 282.31030885], [534.11308566, 330.11296796],
+ [533.53637525, 376.2742511], [533.49380107, 391.72324565],
+ [533.52579142, 330.09494668], [532.50804964, 374.190479],
+ [532.72786934, 380.61615716]],
+keypoints_std = [[107.73640054, 63.35908715], [119.00836213, 64.1215443],
+ [119.12412107, 50.53806215], [120.61688045, 56.38444891],
+ [101.95735275, 62.89636486], [106.24832897, 48.41178119],
+ [108.46734966, 54.58177071], [109.07369806, 68.70443672],
+ [111.20130351, 74.87287863], [111.63203838, 77.80542514],
+ [113.22330788, 79.90670556], [105.7145833, 73.27049436],
+ [107.05804267, 73.93175781], [107.97449418, 83.30391802],
+ [121.60675105, 74.25691526], [134.34378973, 77.48125087],
+ [131.79990652, 89.86721124]]
+codec = dict(
+ type='ImagePoseLifting',
+ num_keypoints=17,
+ root_index=0,
+ remove_root=True,
+ target_mean=target_mean,
+ target_std=target_std,
+ keypoints_mean=keypoints_mean,
+ keypoints_std=keypoints_std)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(1, 1, 1),
+ dropout=0.5,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=16,
+ loss=dict(type='MSELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root', 'target_root_index', 'target_mean',
+ 'target_std'))
+]
+val_pipeline = train_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=1,
+ causal=True,
+ keypoint_2d_src='gt',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=1,
+ causal=True,
+ keypoint_2d_src='gt',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md
new file mode 100644
index 0000000..e710a15
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.md
@@ -0,0 +1,44 @@
+
+
+
+SimpleBaseline3D (ICCV'2017)
+
+```bibtex
+@inproceedings{martinez_2017_3dbaseline,
+ title={A simple yet effective baseline for 3d human pose estimation},
+ author={Martinez, Julieta and Hossain, Rayat and Romero, Javier and Little, James J.},
+ booktitle={ICCV},
+ year={2017}
+}
+```
+
+
+
+
+
+
+Human3.6M (TPAMI'2014)
+
+```bibtex
+@article{h36m_pami,
+ author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
+ title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments},
+ journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+ publisher = {IEEE Computer Society},
+ volume = {36},
+ number = {7},
+ pages = {1325-1339},
+ month = {jul},
+ year = {2014}
+}
+```
+
+
+
+Results on Human3.6M dataset with ground truth 2D detections
+
+| Arch | MPJPE | P-MPJPE | ckpt | log |
+| :-------------------------------------------------------------- | :---: | :-----: | :-------------------------------------------------------------: | :------------------------------------------------------------: |
+| [SimpleBaseline3D<sup>1</sup>](/configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py) | 43.4 | 34.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth) | [log](https://download.openmmlab.com/mmpose/body3d/simple_baseline/20210415_065056.log.json) |
+
+<sup>1</sup> Differing from the original paper, we did not apply the `max-norm constraint`, as we found that omitting it led to better convergence and performance.
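+
+For context, the omitted constraint re-projects each weight-matrix row onto a ball of fixed radius after every optimizer step. A PyTorch sketch of what applying it would look like (illustrative only; the function name and default radius are not from MMPose):
+
+```python
+import torch
+
+
+def apply_max_norm(module: torch.nn.Module, max_norm: float = 1.0) -> None:
+    """Clamp each linear weight row's L2 norm to max_norm after a step."""
+    with torch.no_grad():
+        for param in module.parameters():
+            if param.ndim == 2:  # linear-layer weight matrices
+                norms = param.norm(dim=1, keepdim=True).clamp(min=1e-12)
+                param.mul_(norms.clamp(max=max_norm) / norms)
+```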
diff --git a/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml
new file mode 100644
index 0000000..c06b349
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/image_pose_lift/h36m/simplebaseline3d_h36m.yml
@@ -0,0 +1,21 @@
+Collections:
+- Name: SimpleBaseline3D
+ Paper:
+ Title: A simple yet effective baseline for 3d human pose estimation
+ URL: http://openaccess.thecvf.com/content_iccv_2017/html/Martinez_A_Simple_yet_ICCV_2017_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/simplebaseline3d.md
+Models:
+- Config: configs/body_3d_keypoint/image_pose_lift/h36m/image-pose-lift_tcn_8xb64-200e_h36m.py
+ In Collection: SimpleBaseline3D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline3D
+ Training Data: Human3.6M
+ Name: image-pose-lift_tcn_8xb64-200e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 43.4
+ P-MPJPE: 34.3
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/simple_baseline/simple3Dbaseline_h36m-f0ad73a4_20210419.pth
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md b/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md
new file mode 100644
index 0000000..d6a96e3
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/README.md
@@ -0,0 +1,23 @@
+# MotionBERT: A Unified Perspective on Learning Human Motion Representations
+
+MotionBERT proposes a pretraining stage in which a motion encoder is trained to recover the underlying 3D motion from noisy, partial 2D observations. The motion representations acquired in this way incorporate geometric, kinematic, and physical knowledge about human motion, and can be readily transferred to multiple downstream tasks.
+
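+The sketch below is a toy version of that pretraining objective: mask some 2D observations, then train the encoder to regress the 3D motion. `MotionEncoder` stands in for the DSTFormer backbone and is not the actual MMPose class.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class MotionEncoder(nn.Module):
+    """Stand-in for DSTFormer: a per-frame, per-joint MLP over (x, y, vis)."""
+
+    def __init__(self, feat=512):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(3, feat), nn.ReLU(), nn.Linear(feat, 3))
+
+    def forward(self, kpts_2d):   # (B, T, J, 3): x, y, visibility
+        return self.net(kpts_2d)  # (B, T, J, 3): predicted 3D motion
+
+
+def pretrain_step(encoder, kpts_2d, target_3d, mask_ratio=0.15):
+    # corrupt the 2D input: randomly mask whole joints to zero
+    mask = torch.rand(kpts_2d.shape[:-1]) < mask_ratio
+    corrupted = kpts_2d.clone()
+    corrupted[mask] = 0.0
+    return nn.functional.mse_loss(encoder(corrupted), target_3d)
+
+
+enc = MotionEncoder()
+loss = pretrain_step(enc, torch.randn(2, 243, 17, 3), torch.randn(2, 243, 17, 3))
+```
+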
+## Results and Models
+
+### Human3.6m Dataset
+
+| Arch | MPJPE | P-MPJPE | ckpt | log | Details and Download |
+| :-------------------------------------------------------------------- | :---: | :-----: | :-------------------------------------------------------------------: | :-: | :---------------------------------------------: |
+| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py) | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py) | 27.5 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+
+### Human3.6m Dataset from official repo<sup>1</sup>
+
+| Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt | log | Details and Download |
+| :------------------------------------------------------------- | :---: | :-----------: | :-----: | :-------------------------------------------------------------: | :-: | :---------------------------------------------: |
+| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / | [motionbert_h36m.md](./h36m/motionbert_h36m.md) |
+
+<sup>1</sup> Please refer to the [doc](./h36m/motionbert_h36m.md) for more details.
+
+*Models with \* are converted from the official repo. The config files of these models are provided for validation only; we do not guarantee their training accuracy and welcome you to contribute reproduction results.*
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py
new file mode 100644
index 0000000..30ab90d
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py
@@ -0,0 +1,137 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=240, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+train_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
+val_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='DSTFormer',
+ in_channels=3,
+ feat_size=512,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=2,
+ seq_len=243,
+ att_fuse=True,
+ ),
+ head=dict(
+ type='MotionRegressionHead',
+ in_channels=512,
+ out_channels=3,
+ embedding_size=512,
+ loss=dict(type='MPJPEVelocityJointLoss'),
+ decoder=val_codec,
+ ),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='GenerateTarget', encoder=train_codec),
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
+ target_flip_cfg=dict(center_mode='static', center_x=0.),
+ flip_label=True),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=val_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train_original.npz',
+ seq_len=1,
+ multiple_target=243,
+ multiple_target_step=81,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test_original.npz',
+ factor_file='annotation_body3d/fps50/h36m_factors.npy',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=243,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+skip_list = [
+ 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
+]
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
+ dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py
new file mode 100644
index 0000000..6909ec2
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py
@@ -0,0 +1,136 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=240, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+train_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
+val_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='DSTFormer',
+ in_channels=3,
+ feat_size=512,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=2,
+ seq_len=243,
+ att_fuse=True,
+ ),
+ head=dict(
+ type='MotionRegressionHead',
+ in_channels=512,
+ out_channels=3,
+ embedding_size=512,
+ loss=dict(type='MPJPEVelocityJointLoss'),
+ decoder=val_codec,
+ ),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='GenerateTarget', encoder=train_codec),
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
+ target_flip_cfg=dict(center_mode='static', center_x=0.),
+ flip_label=True),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=val_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=1,
+ multiple_target=243,
+ multiple_target_step=81,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=243,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+skip_list = [
+ 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
+]
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
+ dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py
new file mode 100644
index 0000000..5db0db7
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py
@@ -0,0 +1,142 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=120, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+train_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
+val_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='DSTFormer',
+ in_channels=3,
+ feat_size=512,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=2,
+ seq_len=243,
+ att_fuse=True,
+ ),
+ head=dict(
+ type='MotionRegressionHead',
+ in_channels=512,
+ out_channels=3,
+ embedding_size=512,
+ loss=dict(type='MPJPEVelocityJointLoss'),
+ decoder=val_codec,
+ ),
+ test_cfg=dict(flip_test=True),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/'
+ 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'),
+)
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='GenerateTarget', encoder=train_codec),
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
+ target_flip_cfg=dict(center_mode='static', center_x=0.),
+ flip_label=True),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=val_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train_original.npz',
+ seq_len=1,
+ multiple_target=243,
+ multiple_target_step=81,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test_original.npz',
+ factor_file='annotation_body3d/fps50/h36m_factors.npy',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=243,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+skip_list = [
+ 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
+]
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
+ dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py
new file mode 100644
index 0000000..47f050b
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py
@@ -0,0 +1,141 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=120, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(
+ optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+train_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
+val_codec = dict(
+ type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='DSTFormer',
+ in_channels=3,
+ feat_size=512,
+ depth=5,
+ num_heads=8,
+ mlp_ratio=2,
+ seq_len=243,
+ att_fuse=True,
+ ),
+ head=dict(
+ type='MotionRegressionHead',
+ in_channels=512,
+ out_channels=3,
+ embedding_size=512,
+ loss=dict(type='MPJPEVelocityJointLoss'),
+ decoder=val_codec,
+ ),
+ test_cfg=dict(flip_test=True),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/'
+ 'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'),
+)
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='GenerateTarget', encoder=train_codec),
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
+ target_flip_cfg=dict(center_mode='static', center_x=0.),
+ flip_label=True),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=val_codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'factor', 'camera_param'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=1,
+ multiple_target=243,
+ multiple_target_step=81,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+
+val_dataloader = dict(
+ batch_size=32,
+ prefetch_factor=4,
+ pin_memory=True,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=1,
+ seq_step=1,
+ multiple_target=243,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+skip_list = [
+ 'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
+]
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
+ dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md
new file mode 100644
index 0000000..0d3ee29
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.md
@@ -0,0 +1,55 @@
+
+
+
+MotionBERT (2022)
+
+```bibtex
+@misc{Zhu_Ma_Liu_Liu_Wu_Wang_2022,
+  title={Learning Human Motion Representations: A Unified Perspective},
+  author={Zhu, Wentao and Ma, Xiaoxuan and Liu, Zhaoyang and Liu, Libin and Wu, Wayne and Wang, Yizhou},
+  year={2022},
+  month={Oct},
+  language={en-US}
+}
+```
+
+
+
+
+
+
+Human3.6M (TPAMI'2014)
+
+```bibtex
+@article{h36m_pami,
+author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
+title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments},
+journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+publisher = {IEEE Computer Society},
+volume = {36},
+number = {7},
+pages = {1325-1339},
+month = {jul},
+year = {2014}
+}
+```
+
+
+
+Results on Human3.6M dataset with ground truth 2D detections
+
+| Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt |
+| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: |
+| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |
+
+Results on Human3.6M dataset converted from the [official repo](https://github.com/Walter0807/MotionBERT)<sup>1</sup> with ground truth 2D detections
+
+| Arch | MPJPE | Average MPJPE | P-MPJPE | ckpt | log |
+| :------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :------------------------------------------------------------------------------------: | :-: |
+| [MotionBERT\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) | / |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) | / |
+
+<sup>1</sup> By default, we test models with the [Human 3.6m dataset](/docs/en/dataset_zoo/3d_body_keypoint.md#human3-6m) processed by MMPose. The official repo's dataset includes more data and applies a different pre-processing technique. To reproduce the official repo's results, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy) to `$MMPOSE/data/h36m/annotation_body3d/fps50` and test with the configs we provide.
+
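+A small Python sketch of fetching those files into place (the URLs come from the note above; the destination assumes the standard MMPose data layout and that you run from `$MMPOSE`):
+
+```python
+import os
+import urllib.request
+
+BASE = 'https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m'
+DEST = 'data/h36m/annotation_body3d/fps50'  # relative to $MMPOSE
+
+os.makedirs(DEST, exist_ok=True)
+for name in ('h36m_test_original.npz', 'h36m_train_original.npz',
+              'h36m_factors.npy'):
+    # download each annotation file next to the MMPose-processed ones
+    urllib.request.urlretrieve(f'{BASE}/{name}', os.path.join(DEST, name))
+```
+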
+*Models with \* are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are provided for validation only; we do not guarantee their training accuracy and welcome you to contribute reproduction results.*
diff --git a/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml
new file mode 100644
index 0000000..93c4eda
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/motionbert/h36m/motionbert_h36m.yml
@@ -0,0 +1,45 @@
+Collections:
+- Name: MotionBERT
+ Paper:
+ Title: "Learning Human Motion Representations: A Unified Perspective"
+ URL: https://arxiv.org/abs/2210.06551
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/motionbert.md
+Models:
+- Config: configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-243frm_8xb32-240e_h36m.py
+ In Collection: MotionBERT
+ Metadata:
+ Architecture: &id001
+ - MotionBERT
+ Training Data: Human3.6M (MotionBERT)
+ Name: motionbert_dstformer-243frm_8xb32-240e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 34.5
+ P-MPJPE: 27.1
+ Task: Body 3D Keypoint
+ - Dataset: Human3.6M (MotionBERT)
+ Metrics:
+ MPJPE: 39.8
+ P-MPJPE: 33.4
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth
+- Config: configs/body_3d_keypoint/motionbert/h36m/motionbert_dstformer-ft-243frm_8xb32-120e_h36m.py
+ In Collection: MotionBERT
+ Alias: human3d
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M (MotionBERT)
+ Name: motionbert_dstformer-ft-243frm_8xb32-120e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 26.9
+ P-MPJPE: 21.0
+ Task: Body 3D Keypoint
+ - Dataset: Human3.6M (MotionBERT)
+ Metrics:
+ MPJPE: 37.7
+ P-MPJPE: 32.2
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md
new file mode 100644
index 0000000..de32a06
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/README.md
@@ -0,0 +1,17 @@
+# 3D human pose estimation in video with temporal convolutions and semi-supervised training
+
+Building on the success of 2D human pose estimation, VideoPose3D directly "lifts" a sequence of 2D keypoints to 3D keypoints, using temporal convolutions over the sequence to exploit temporal context.
+
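+A minimal sketch of that temporal-convolution idea: stacked 1D convolutions with growing dilation reduce a 27-frame window of 2D poses to a single 3D pose. The layer sizes echo the 27-frame config in the table below, but this class is an illustration, not MMPose's `TCN` backbone.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class TemporalLifter(nn.Module):
+    """Lift a window of 2D poses (B, T=27, 17, 2) to one 3D pose (B, 17, 3)."""
+
+    def __init__(self, joints=17, ch=1024):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv1d(joints * 2, ch, kernel_size=3), nn.ReLU(),      # 27 -> 25
+            nn.Conv1d(ch, ch, kernel_size=3, dilation=3), nn.ReLU(),  # 25 -> 19
+            nn.Conv1d(ch, ch, kernel_size=3, dilation=9), nn.ReLU(),  # 19 -> 1
+            nn.Conv1d(ch, joints * 3, kernel_size=1))
+
+    def forward(self, x):                 # x: (B, 27, 17, 2)
+        x = x.flatten(2).transpose(1, 2)  # (B, 34, 27): joints as channels
+        return self.net(x).squeeze(-1).view(-1, 17, 3)
+
+
+out = TemporalLifter()(torch.randn(2, 27, 17, 2))  # -> (2, 17, 3)
+```
+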
+## Results and Models
+
+### Human3.6m Dataset
+
+| Arch | MPJPE | P-MPJPE | N-MPJPE | ckpt | log | Details and Download |
+| :-------------------------------------------- | :---: | :-----: | :-----: | :-------------------------------------------: | :------------------------------------------: | :---------------------------------------------: |
+| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py) | 40.1 | 30.1 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py) | 39.1 | 29.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py) | 37.6 | 28.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 53.0 | 41.3 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 47.9 | 38.0 | / | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py) | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
+| [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) | [videopose3d_h36m.md](./h36m/videopose3d_h36m.md) |
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py
new file mode 100644
index 0000000..3e164fd
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py
@@ -0,0 +1,132 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=160, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.98, end=80, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=4,
+ kernel_sizes=(1, 1, 1, 1, 1),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(),
+ target_flip_cfg=dict(),
+ ),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=1,
+ causal=False,
+ pad_video_seq=False,
+ keypoint_2d_src='detection',
+ keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_train.npy',
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ),
+)
+val_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=1,
+ causal=False,
+ pad_video_seq=False,
+ keypoint_2d_src='detection',
+ keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy',
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py
new file mode 100644
index 0000000..592eac9
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py
@@ -0,0 +1,132 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=200, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-4))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.98, end=200, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=4,
+ kernel_sizes=(3, 3, 3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(),
+ target_flip_cfg=dict(),
+ ),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=243,
+ causal=False,
+ pad_video_seq=True,
+ keypoint_2d_src='detection',
+ keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_train.npy',
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ),
+)
+val_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=243,
+ causal=False,
+ pad_video_seq=True,
+ keypoint_2d_src='detection',
+ keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy',
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py
new file mode 100644
index 0000000..edcf918
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py
@@ -0,0 +1,128 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=160, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=4,
+ kernel_sizes=(3, 3, 3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(),
+ target_flip_cfg=dict(),
+ ),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=243,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ),
+)
+val_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=243,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py
new file mode 100644
index 0000000..842bee6
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py
@@ -0,0 +1,119 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = None
+
+# optimizer
+
+# learning policy
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ),
+ traj_backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ traj_head=dict(
+ type='TrajectoryRegressionHead',
+ in_channels=1024,
+ num_joints=1,
+ loss=dict(type='MPJPELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ semi_loss=dict(
+ type='SemiSupervisionLoss',
+ joint_parents=[0, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
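+ # warm up for 5 epochs: 1311376 samples // 64 (per-GPU batch) // 8 (GPUs)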
+ warmup_iterations=1311376 // 64 // 8 * 5),
+)
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=27,
+ causal=False,
+ pad_video_seq=True,
+ keypoint_2d_src='detection',
+ keypoint_2d_det_file='joint_2d_det_files/cpn_ft_h36m_dbb_test.npy',
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe'),
+ dict(type='MPJPE', mode='n-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py
new file mode 100644
index 0000000..7f0e68c
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py
@@ -0,0 +1,117 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = None
+
+# optimizer
+
+# learning policy
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ),
+ traj_backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ traj_head=dict(
+ type='TrajectoryRegressionHead',
+ in_channels=1024,
+ num_joints=1,
+ loss=dict(type='MPJPELoss', use_target_weight=True),
+ decoder=codec,
+ ),
+ semi_loss=dict(
+ type='SemiSupervisionLoss',
+ joint_parents=[0, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
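+ # warm up for 5 epochs: 1311376 samples // 64 (per-GPU batch) // 8 (GPUs)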
+ warmup_iterations=1311376 // 64 // 8 * 5),
+)
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+val_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=27,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe'),
+ dict(type='MPJPE', mode='n-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py
new file mode 100644
index 0000000..9d39fe7
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py
@@ -0,0 +1,128 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=160, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=2,
+ kernel_sizes=(3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(),
+ target_flip_cfg=dict(),
+ ),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=27,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ),
+)
+val_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=27,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py
new file mode 100644
index 0000000..e43326e
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py
@@ -0,0 +1,128 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=160, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=1e-3))
+
+# learning policy
+param_scheduler = [
+ dict(type='ExponentialLR', gamma=0.975, end=80, by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=1024)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ save_best='MPJPE',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='VideoPoseLifting',
+ num_keypoints=17,
+ zero_center=True,
+ root_index=0,
+ remove_root=False)
+
+# model settings
+model = dict(
+ type='PoseLifter',
+ backbone=dict(
+ type='TCN',
+ in_channels=2 * 17,
+ stem_channels=1024,
+ num_blocks=3,
+ kernel_sizes=(3, 3, 3, 3),
+ dropout=0.25,
+ use_stride_conv=True,
+ ),
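+    # Temporal receptive field = 3 * 3 * 3 * 3 = 81 frames (one more block
+    # than the 27-frame variant), matching seq_len=81 below.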
+ head=dict(
+ type='TemporalRegressionHead',
+ in_channels=1024,
+ num_joints=17,
+ loss=dict(type='MPJPELoss'),
+ decoder=codec,
+ ))
+
+# base dataset settings
+dataset_type = 'Human36mDataset'
+data_root = 'data/h36m/'
+
+# pipelines
+train_pipeline = [
+ dict(
+ type='RandomFlipAroundRoot',
+ keypoints_flip_cfg=dict(),
+ target_flip_cfg=dict(),
+ ),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+val_pipeline = [
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
+ 'target_root'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_train.npz',
+ seq_len=81,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ),
+)
+val_dataloader = dict(
+ batch_size=128,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotation_body3d/fps50/h36m_test.npz',
+ seq_len=81,
+ causal=False,
+ pad_video_seq=True,
+ camera_param_file='annotation_body3d/cameras.pkl',
+ data_root=data_root,
+ data_prefix=dict(img='images/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='MPJPE', mode='mpjpe'),
+ dict(type='MPJPE', mode='p-mpjpe')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md
new file mode 100644
index 0000000..01cef1d
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.md
@@ -0,0 +1,67 @@
+**VideoPose3D (CVPR'2019)**
+
+```bibtex
+@inproceedings{pavllo20193d,
+title={3d human pose estimation in video with temporal convolutions and semi-supervised training},
+author={Pavllo, Dario and Feichtenhofer, Christoph and Grangier, David and Auli, Michael},
+booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+pages={7753--7762},
+year={2019}
+}
+```
+
+**Human3.6M (TPAMI'2014)**
+
+```bibtex
+@article{h36m_pami,
+author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
+title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments},
+journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
+publisher = {IEEE Computer Society},
+volume = {36},
+number = {7},
+pages = {1325-1339},
+month = {jul},
+year = {2014}
+}
+```
+
+Testing results on Human3.6M dataset with ground truth 2D detections, supervised training
+
+| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log |
+| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [VideoPose3D-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py) | 27 | 40.1 | 30.1 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised_20210527.log.json) |
+| [VideoPose3D-supervised-81frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py) | 81 | 39.1 | 29.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised_20210527.log.json) |
+| [VideoPose3D-supervised-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py) | 243 | 37.6 | 28.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_20210527.log.json) |
+
+Testing results on Human3.6M dataset with CPN 2D detections<sup>1</sup>, supervised training
+
+| Arch | Receptive Field | MPJPE | P-MPJPE | ckpt | log |
+| :--------------------------------------------------------- | :-------------: | :---: | :-----: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [VideoPose3D-supervised-CPN-1frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py) | 1 | 53.0 | 41.3 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft_20210527.log.json) |
+| [VideoPose3D-supervised-CPN-243frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py) | 243 | 47.9 | 38.0 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft_20210527.log.json) |
+
+Testing results on Human3.6M dataset with ground truth 2D detections, semi-supervised training
+
+| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
+| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: |
+| 10% S1 | [VideoPose3D-semi-supervised-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py) | 27 | 57.2 | 42.4 | 54.2 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_20210527.log.json) |
+
+Testing results on Human3.6M dataset with CPN 2D detections<sup>1</sup>, semi-supervised training
+
+| Training Data | Arch | Receptive Field | MPJPE | P-MPJPE | N-MPJPE | ckpt | log |
+| :------------ | :-------------------------------------------------: | :-------------: | :---: | :-----: | :-----: | :-------------------------------------------------: | :-------------------------------------------------: |
+| 10% S1 | [VideoPose3D-semi-supervised-CPN-27frm](/configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py) | 27 | 67.3 | 50.4 | 63.6 | [ckpt](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth) | [log](https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft_20210527.log.json) |
+
+<sup>1</sup> CPN 2D detections are provided by [official repo](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md). The reformatted version used in this repository can be downloaded from [train_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_train.npy) and [test_detection](https://download.openmmlab.com/mmpose/body3d/videopose/cpn_ft_h36m_dbb_test.npy).
diff --git a/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml
new file mode 100644
index 0000000..81e66c5
--- /dev/null
+++ b/modules/rtmpose/configs/body_3d_keypoint/video_pose_lift/h36m/videopose3d_h36m.yml
@@ -0,0 +1,102 @@
+Collections:
+- Name: VideoPose3D
+ Paper:
+ Title: 3d human pose estimation in video with temporal convolutions and semi-supervised
+ training
+ URL: http://openaccess.thecvf.com/content_CVPR_2019/html/Pavllo_3D_Human_Pose_Estimation_in_Video_With_Temporal_Convolutions_and_CVPR_2019_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/en/papers/algorithms/videopose3d.md
+Models:
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: &id001
+ - VideoPose3D
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-27frm-supv_8xb128-160e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 40.0
+ P-MPJPE: 30.1
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_supervised-fe8fbba9_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-81frm-supv_8xb128-160e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 38.9
+ P-MPJPE: 29.2
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_81frames_fullconv_supervised-1f2d1104_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-243frm-supv_8xb128-160e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 37.6
+ P-MPJPE: 28.3
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised-880bea25_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-1frm-supv-cpn-ft_8xb128-160e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 52.9
+ P-MPJPE: 41.3
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_1frame_fullconv_supervised_cpn_ft-5c3afaed_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-243frm-supv-cpn-ft_8xb128-200e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 47.9
+ P-MPJPE: 38.0
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_243frames_fullconv_supervised_cpn_ft-88f5abbb_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-27frm-semi-supv_8xb64-200e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 58.1
+ N-MPJPE: 54.7
+ P-MPJPE: 42.8
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised-54aef83b_20210527.pth
+- Config: configs/body_3d_keypoint/video_pose_lift/h36m/video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m.py
+ In Collection: VideoPose3D
+ Metadata:
+ Architecture: *id001
+ Training Data: Human3.6M
+ Name: video-pose-lift_tcn-27frm-semi-supv-cpn-ft_8xb64-200e_h36m
+ Results:
+ - Dataset: Human3.6M
+ Metrics:
+ MPJPE: 67.4
+ N-MPJPE: 63.2
+ P-MPJPE: 50.1
+ Task: Body 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/body3d/videopose/videopose_h36m_27frames_fullconv_semi-supervised_cpn_ft-71be9cde_20210527.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/README.md b/modules/rtmpose/configs/face_2d_keypoint/README.md
new file mode 100644
index 0000000..058b9b5
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/README.md
@@ -0,0 +1,16 @@
+# 2D Face Landmark Detection
+
+2D face landmark detection (also referred to as face alignment) is the task of detecting face keypoints in an input image.
+
+Normally, the input is a cropped face image with the face at the center,
+or the rough location (or the bounding box) of the face is provided.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_face_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/2d_face_demo.md) to run demos.
+
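+## Inference sketch
+
+A minimal top-down inference sketch with the high-level mmpose API (the
+checkpoint path and image name below are placeholders, not files shipped in
+this repo; when `bboxes` is omitted, `inference_topdown` treats the whole
+image as one face box):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+config = ('configs/face_2d_keypoint/rtmpose/coco_wholebody_face/'
+          'rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py')
+pose_estimator = init_model(config, 'path/to/checkpoint.pth', device='cpu')
+
+# Run inference on a cropped face image; returns a list of PoseDataSample.
+results = inference_topdown(pose_estimator, 'face.jpg')
+keypoints = results[0].pred_instances.keypoints  # (num_instances, 68, 2)
+```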
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md
new file mode 100644
index 0000000..c04d5bc
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/README.md
@@ -0,0 +1,32 @@
+# RTMPose
+
+Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet industrial applications still suffer from heavy model parameters and high latency.
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
+Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
+To further evaluate RTMPose's capability in critical real-time applications, we also report performance after deployment on a mobile device.
+
+## Results and Models
+
+### COCO-WholeBody-Face Dataset
+
+Results on COCO-WholeBody-Face val set
+
+| Model | Input Size | NME | Details and Download |
+| :-------: | :--------: | :----: | :------------------------------------------------------------------------------------: |
+| RTMPose-m | 256x256 | 0.0466 | [rtmpose_coco_wholebody_face.md](./coco_wholebody_face/rtmpose_coco_wholebody_face.md) |
+
+### WFLW Dataset
+
+Results on WFLW dataset
+
+| Model | Input Size | NME | Details and Download |
+| :-------: | :--------: | :--: | :---------------------------------------: |
+| RTMPose-m | 256x256 | 4.01 | [rtmpose_wflw.md](./wflw/rtmpose_wflw.md) |
+
+### LaPa Dataset
+
+Results on LaPa dataset
+
+| Model | Input Size | NME | Details and Download |
+| :-------: | :--------: | :--: | :---------------------------------------: |
+| RTMPose-m | 256x256 | 1.29 | [rtmpose_lapa.md](./lapa/rtmpose_lapa.md) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..07db40c
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,231 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 60
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
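+# SimCC casts x/y localisation as two 1-D classification problems: with a
+# 256x256 input and simcc_split_ratio=2.0, each axis is discretised into
+# 512 bins.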
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
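+    # CSPNeXt is registered in mmdet; _scope_='mmdet' tells MMEngine to
+    # resolve the backbone from mmdet's registry.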
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=68,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
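+# EMAHook validates and checkpoints an exponential moving average of the
+# weights; PipelineSwitchHook swaps in the milder stage-2 augmentations for
+# the last stage2_num_epochs epochs (RTMPose's two-stage training recipe).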
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
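+# norm_mode='keypoint_distance' normalises the error by the distance between
+# two dataset-specific reference keypoints (typically the inter-ocular
+# distance for faces).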
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md
new file mode 100644
index 0000000..fb09265
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md
@@ -0,0 +1,39 @@
+**RTMDet (ArXiv 2022)**
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+**COCO-WholeBody-Face (ECCV'2020)**
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0466 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml
new file mode 100644
index 0000000..00b0906
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.yml
@@ -0,0 +1,14 @@
+Models:
+- Config: configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: COCO-WholeBody-Face
+ Name: rtmpose-m_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0466
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py
new file mode 100644
index 0000000..22d28dd
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py
@@ -0,0 +1,690 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# Trained on six face datasets: LaPa, COCO-WholeBody-Face, WFLW, 300W, COFW, Halpe.
+
+# runtime
+max_epochs = 120
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.005,
+ begin=30,
+ end=max_epochs,
+ T_max=max_epochs - 30,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=106,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'LapaDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.2),
+ dict(type='MedianBlur', p=0.2),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# train dataset
+dataset_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
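+# KeypointConverter mapping entries are (source_index, target_index) pairs
+# into the 106-point scheme; a 2-tuple of source indices means the converted
+# keypoint is the average of the two source points.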
+kpt_68_to_106 = [
+ #
+ (0, 0),
+ (1, 2),
+ (2, 4),
+ (3, 6),
+ (4, 8),
+ (5, 10),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 18),
+ (10, 20),
+ (11, 22),
+ (12, 24),
+ (13, 26),
+ (14, 28),
+ (15, 30),
+ (16, 32),
+ #
+ (17, 33),
+ (18, 34),
+ (19, 35),
+ (20, 36),
+ (21, 37),
+ #
+ (22, 42),
+ (23, 43),
+ (24, 44),
+ (25, 45),
+ (26, 46),
+ #
+ (27, 51),
+ (28, 52),
+ (29, 53),
+ (30, 54),
+ #
+ (31, 58),
+ (32, 59),
+ (33, 60),
+ (34, 61),
+ (35, 62),
+ #
+ (36, 66),
+ (39, 70),
+ #
+ ((37, 38), 68),
+ ((40, 41), 72),
+ #
+ (42, 75),
+ (45, 79),
+ #
+ ((43, 44), 77),
+ ((46, 47), 81),
+ #
+ (48, 84),
+ (49, 85),
+ (50, 86),
+ (51, 87),
+ (52, 88),
+ (53, 89),
+ (54, 90),
+ (55, 91),
+ (56, 92),
+ (57, 93),
+ (58, 94),
+ (59, 95),
+ (60, 96),
+ (61, 97),
+ (62, 98),
+ (63, 99),
+ (64, 100),
+ (65, 101),
+ (66, 102),
+ (67, 103)
+]
+
+mapping_halpe = [
+ #
+ (26, 0),
+ (27, 2),
+ (28, 4),
+ (29, 6),
+ (30, 8),
+ (31, 10),
+ (32, 12),
+ (33, 14),
+ (34, 16),
+ (35, 18),
+ (36, 20),
+ (37, 22),
+ (38, 24),
+ (39, 26),
+ (40, 28),
+ (41, 30),
+ (42, 32),
+ #
+ (43, 33),
+ (44, 34),
+ (45, 35),
+ (46, 36),
+ (47, 37),
+ #
+ (48, 42),
+ (49, 43),
+ (50, 44),
+ (51, 45),
+ (52, 46),
+ #
+ (53, 51),
+ (54, 52),
+ (55, 53),
+ (56, 54),
+ #
+ (57, 58),
+ (58, 59),
+ (59, 60),
+ (60, 61),
+ (61, 62),
+ #
+ (62, 66),
+ (65, 70),
+ #
+ ((63, 64), 68),
+ ((66, 67), 72),
+ #
+ (68, 75),
+ (71, 79),
+ #
+ ((69, 70), 77),
+ ((72, 73), 81),
+ #
+ (74, 84),
+ (75, 85),
+ (76, 86),
+ (77, 87),
+ (78, 88),
+ (79, 89),
+ (80, 90),
+ (81, 91),
+ (82, 92),
+ (83, 93),
+ (84, 94),
+ (85, 95),
+ (86, 96),
+ (87, 97),
+ (88, 98),
+ (89, 99),
+ (90, 100),
+ (91, 101),
+ (92, 102),
+ (93, 103)
+]
+
+mapping_wflw = [
+ #
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+ (17, 17),
+ (18, 18),
+ (19, 19),
+ (20, 20),
+ (21, 21),
+ (22, 22),
+ (23, 23),
+ (24, 24),
+ (25, 25),
+ (26, 26),
+ (27, 27),
+ (28, 28),
+ (29, 29),
+ (30, 30),
+ (31, 31),
+ (32, 32),
+ #
+ (33, 33),
+ (34, 34),
+ (35, 35),
+ (36, 36),
+ (37, 37),
+ (38, 38),
+ (39, 39),
+ (40, 40),
+ (41, 41),
+ #
+ (42, 42),
+ (43, 43),
+ (44, 44),
+ (45, 45),
+ (46, 46),
+ (47, 47),
+ (48, 48),
+ (49, 49),
+ (50, 50),
+ #
+ (51, 51),
+ (52, 52),
+ (53, 53),
+ (54, 54),
+ #
+ (55, 58),
+ (56, 59),
+ (57, 60),
+ (58, 61),
+ (59, 62),
+ #
+ (60, 66),
+ (61, 67),
+ (62, 68),
+ (63, 69),
+ (64, 70),
+ (65, 71),
+ (66, 72),
+ (67, 73),
+ #
+ (68, 75),
+ (69, 76),
+ (70, 77),
+ (71, 78),
+ (72, 79),
+ (73, 80),
+ (74, 81),
+ (75, 82),
+ #
+ (76, 84),
+ (77, 85),
+ (78, 86),
+ (79, 87),
+ (80, 88),
+ (81, 89),
+ (82, 90),
+ (83, 91),
+ (84, 92),
+ (85, 93),
+ (86, 94),
+ (87, 95),
+ (88, 96),
+ (89, 97),
+ (90, 98),
+ (91, 99),
+ (92, 100),
+ (93, 101),
+ (94, 102),
+ (95, 103),
+ #
+ (96, 104),
+ #
+ (97, 105)
+]
+
+mapping_cofw = [
+ #
+ (0, 33),
+ (2, 38),
+ (4, 35),
+ (5, 40),
+ #
+ (1, 46),
+ (3, 50),
+ (6, 44),
+ (7, 48),
+ #
+ (8, 60),
+ (10, 64),
+ (12, 62),
+ (13, 66),
+ #
+ (9, 72),
+ (11, 68),
+ (14, 70),
+ (15, 74),
+ #
+ (18, 57),
+ (19, 63),
+ (20, 54),
+ (21, 60),
+ #
+ (22, 84),
+ (23, 90),
+ (24, 87),
+ (25, 98),
+ (26, 102),
+ (27, 93),
+ #
+ (28, 16)
+]
+dataset_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_133kpt.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
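+# CombinedDataset concatenates the six sources above; each per-dataset
+# pipeline remaps its annotations into the unified 106-keypoint scheme, and
+# metainfo pins LaPa's keypoint definition as the shared one.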
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[
+ dataset_lapa, dataset_coco, dataset_wflw, dataset_300w,
+ dataset_cofw, dataset_halpe
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# test dataset
+val_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
+val_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+val_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_test.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_test.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py
new file mode 100644
index 0000000..b18d19d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py
@@ -0,0 +1,691 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# Trained on six face datasets: LaPa, COCO-WholeBody-Face, WFLW, 300W, COFW, Halpe.
+
+# runtime
+max_epochs = 120
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.005,
+ begin=30,
+ end=max_epochs,
+ T_max=max_epochs - 30,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth')
+ ),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=106,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'LapaDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.2),
+ dict(type='MedianBlur', p=0.2),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+# train dataset
+dataset_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
+kpt_68_to_106 = [
+ #
+ (0, 0),
+ (1, 2),
+ (2, 4),
+ (3, 6),
+ (4, 8),
+ (5, 10),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 18),
+ (10, 20),
+ (11, 22),
+ (12, 24),
+ (13, 26),
+ (14, 28),
+ (15, 30),
+ (16, 32),
+ #
+ (17, 33),
+ (18, 34),
+ (19, 35),
+ (20, 36),
+ (21, 37),
+ #
+ (22, 42),
+ (23, 43),
+ (24, 44),
+ (25, 45),
+ (26, 46),
+ #
+ (27, 51),
+ (28, 52),
+ (29, 53),
+ (30, 54),
+ #
+ (31, 58),
+ (32, 59),
+ (33, 60),
+ (34, 61),
+ (35, 62),
+ #
+ (36, 66),
+ (39, 70),
+ #
+ ((37, 38), 68),
+ ((40, 41), 72),
+ #
+ (42, 75),
+ (45, 79),
+ #
+ ((43, 44), 77),
+ ((46, 47), 81),
+ #
+ (48, 84),
+ (49, 85),
+ (50, 86),
+ (51, 87),
+ (52, 88),
+ (53, 89),
+ (54, 90),
+ (55, 91),
+ (56, 92),
+ (57, 93),
+ (58, 94),
+ (59, 95),
+ (60, 96),
+ (61, 97),
+ (62, 98),
+ (63, 99),
+ (64, 100),
+ (65, 101),
+ (66, 102),
+ (67, 103)
+]
+
+mapping_halpe = [
+ #
+ (26, 0),
+ (27, 2),
+ (28, 4),
+ (29, 6),
+ (30, 8),
+ (31, 10),
+ (32, 12),
+ (33, 14),
+ (34, 16),
+ (35, 18),
+ (36, 20),
+ (37, 22),
+ (38, 24),
+ (39, 26),
+ (40, 28),
+ (41, 30),
+ (42, 32),
+ #
+ (43, 33),
+ (44, 34),
+ (45, 35),
+ (46, 36),
+ (47, 37),
+ #
+ (48, 42),
+ (49, 43),
+ (50, 44),
+ (51, 45),
+ (52, 46),
+ #
+ (53, 51),
+ (54, 52),
+ (55, 53),
+ (56, 54),
+ #
+ (57, 58),
+ (58, 59),
+ (59, 60),
+ (60, 61),
+ (61, 62),
+ #
+ (62, 66),
+ (65, 70),
+ #
+ ((63, 64), 68),
+ ((66, 67), 72),
+ #
+ (68, 75),
+ (71, 79),
+ #
+ ((69, 70), 77),
+ ((72, 73), 81),
+ #
+ (74, 84),
+ (75, 85),
+ (76, 86),
+ (77, 87),
+ (78, 88),
+ (79, 89),
+ (80, 90),
+ (81, 91),
+ (82, 92),
+ (83, 93),
+ (84, 94),
+ (85, 95),
+ (86, 96),
+ (87, 97),
+ (88, 98),
+ (89, 99),
+ (90, 100),
+ (91, 101),
+ (92, 102),
+ (93, 103)
+]
+
+mapping_wflw = [
+ #
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+ (17, 17),
+ (18, 18),
+ (19, 19),
+ (20, 20),
+ (21, 21),
+ (22, 22),
+ (23, 23),
+ (24, 24),
+ (25, 25),
+ (26, 26),
+ (27, 27),
+ (28, 28),
+ (29, 29),
+ (30, 30),
+ (31, 31),
+ (32, 32),
+ #
+ (33, 33),
+ (34, 34),
+ (35, 35),
+ (36, 36),
+ (37, 37),
+ (38, 38),
+ (39, 39),
+ (40, 40),
+ (41, 41),
+ #
+ (42, 42),
+ (43, 43),
+ (44, 44),
+ (45, 45),
+ (46, 46),
+ (47, 47),
+ (48, 48),
+ (49, 49),
+ (50, 50),
+ #
+ (51, 51),
+ (52, 52),
+ (53, 53),
+ (54, 54),
+ #
+ (55, 58),
+ (56, 59),
+ (57, 60),
+ (58, 61),
+ (59, 62),
+ #
+ (60, 66),
+ (61, 67),
+ (62, 68),
+ (63, 69),
+ (64, 70),
+ (65, 71),
+ (66, 72),
+ (67, 73),
+ #
+ (68, 75),
+ (69, 76),
+ (70, 77),
+ (71, 78),
+ (72, 79),
+ (73, 80),
+ (74, 81),
+ (75, 82),
+ #
+ (76, 84),
+ (77, 85),
+ (78, 86),
+ (79, 87),
+ (80, 88),
+ (81, 89),
+ (82, 90),
+ (83, 91),
+ (84, 92),
+ (85, 93),
+ (86, 94),
+ (87, 95),
+ (88, 96),
+ (89, 97),
+ (90, 98),
+ (91, 99),
+ (92, 100),
+ (93, 101),
+ (94, 102),
+ (95, 103),
+ #
+ (96, 104),
+ #
+ (97, 105)
+]
+
+mapping_cofw = [
+ #
+ (0, 33),
+ (2, 38),
+ (4, 35),
+ (5, 40),
+ #
+ (1, 46),
+ (3, 50),
+ (6, 44),
+ (7, 48),
+ #
+ (8, 60),
+ (10, 64),
+ (12, 62),
+ (13, 66),
+ #
+ (9, 72),
+ (11, 68),
+ (14, 70),
+ (15, 74),
+ #
+ (18, 57),
+ (19, 63),
+ (20, 54),
+ (21, 60),
+ #
+ (22, 84),
+ (23, 90),
+ (24, 87),
+ (25, 98),
+ (26, 102),
+ (27, 93),
+ #
+ (28, 16)
+]
+dataset_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_133kpt.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[
+ dataset_lapa, dataset_coco, dataset_wflw, dataset_300w,
+ dataset_cofw, dataset_halpe
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ pin_memory=True,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# test dataset
+val_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
+val_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+val_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_test.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_test.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py
new file mode 100644
index 0000000..88e9517
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py
@@ -0,0 +1,689 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# Trained on six face datasets: LaPa, COCO-WholeBody-Face, WFLW, 300W, COFW, Halpe.
+
+# runtime
+max_epochs = 120
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.005,
+ begin=30,
+ end=max_epochs,
+        T_max=max_epochs - 30,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=106,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'LapaDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.2),
+ dict(type='MedianBlur', p=0.2),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+# train dataset
+dataset_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
+kpt_68_to_106 = [
+ #
+ (0, 0),
+ (1, 2),
+ (2, 4),
+ (3, 6),
+ (4, 8),
+ (5, 10),
+ (6, 12),
+ (7, 14),
+ (8, 16),
+ (9, 18),
+ (10, 20),
+ (11, 22),
+ (12, 24),
+ (13, 26),
+ (14, 28),
+ (15, 30),
+ (16, 32),
+ #
+ (17, 33),
+ (18, 34),
+ (19, 35),
+ (20, 36),
+ (21, 37),
+ #
+ (22, 42),
+ (23, 43),
+ (24, 44),
+ (25, 45),
+ (26, 46),
+ #
+ (27, 51),
+ (28, 52),
+ (29, 53),
+ (30, 54),
+ #
+ (31, 58),
+ (32, 59),
+ (33, 60),
+ (34, 61),
+ (35, 62),
+ #
+ (36, 66),
+ (39, 70),
+ #
+ ((37, 38), 68),
+ ((40, 41), 72),
+ #
+ (42, 75),
+ (45, 79),
+ #
+ ((43, 44), 77),
+ ((46, 47), 81),
+ #
+ (48, 84),
+ (49, 85),
+ (50, 86),
+ (51, 87),
+ (52, 88),
+ (53, 89),
+ (54, 90),
+ (55, 91),
+ (56, 92),
+ (57, 93),
+ (58, 94),
+ (59, 95),
+ (60, 96),
+ (61, 97),
+ (62, 98),
+ (63, 99),
+ (64, 100),
+ (65, 101),
+ (66, 102),
+ (67, 103)
+]
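
The mapping entries above are `(source_index, target_index)` pairs; a tuple source such as `((37, 38), 68)` marks a target keypoint obtained by averaging two source keypoints. A minimal sketch of how such a mapping could be applied — illustrative only, not mmpose's actual `KeypointConverter` implementation:

```python
import numpy as np

def convert_keypoints(kpts, mapping, num_target_kpts=106):
    """Remap (N, 2) source keypoints into the 106-point target layout.

    mapping entries are (src, dst); a tuple src such as (37, 38) means
    the target point is the mean of the two source points. Unmapped
    target indices stay at zero with zero visibility.
    """
    out = np.zeros((num_target_kpts, 2), dtype=np.float32)
    visible = np.zeros(num_target_kpts, dtype=np.float32)
    for src, dst in mapping:
        if isinstance(src, tuple):
            out[dst] = kpts[list(src)].mean(axis=0)
        else:
            out[dst] = kpts[src]
        visible[dst] = 1.0
    return out, visible

# e.g. 68-point 300W / COCO-WholeBody-Face annotations -> 106-point layout,
# using kpt_68_to_106 as defined above
kpts_68 = np.random.rand(68, 2).astype(np.float32)
kpts_106, vis = convert_keypoints(kpts_68, kpt_68_to_106)
```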
+
+mapping_halpe = [
+ #
+ (26, 0),
+ (27, 2),
+ (28, 4),
+ (29, 6),
+ (30, 8),
+ (31, 10),
+ (32, 12),
+ (33, 14),
+ (34, 16),
+ (35, 18),
+ (36, 20),
+ (37, 22),
+ (38, 24),
+ (39, 26),
+ (40, 28),
+ (41, 30),
+ (42, 32),
+ #
+ (43, 33),
+ (44, 34),
+ (45, 35),
+ (46, 36),
+ (47, 37),
+ #
+ (48, 42),
+ (49, 43),
+ (50, 44),
+ (51, 45),
+ (52, 46),
+ #
+ (53, 51),
+ (54, 52),
+ (55, 53),
+ (56, 54),
+ #
+ (57, 58),
+ (58, 59),
+ (59, 60),
+ (60, 61),
+ (61, 62),
+ #
+ (62, 66),
+ (65, 70),
+ #
+ ((63, 64), 68),
+ ((66, 67), 72),
+ #
+ (68, 75),
+ (71, 79),
+ #
+ ((69, 70), 77),
+ ((72, 73), 81),
+ #
+ (74, 84),
+ (75, 85),
+ (76, 86),
+ (77, 87),
+ (78, 88),
+ (79, 89),
+ (80, 90),
+ (81, 91),
+ (82, 92),
+ (83, 93),
+ (84, 94),
+ (85, 95),
+ (86, 96),
+ (87, 97),
+ (88, 98),
+ (89, 99),
+ (90, 100),
+ (91, 101),
+ (92, 102),
+ (93, 103)
+]
+
+mapping_wflw = [
+ #
+ (0, 0),
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+ (17, 17),
+ (18, 18),
+ (19, 19),
+ (20, 20),
+ (21, 21),
+ (22, 22),
+ (23, 23),
+ (24, 24),
+ (25, 25),
+ (26, 26),
+ (27, 27),
+ (28, 28),
+ (29, 29),
+ (30, 30),
+ (31, 31),
+ (32, 32),
+ #
+ (33, 33),
+ (34, 34),
+ (35, 35),
+ (36, 36),
+ (37, 37),
+ (38, 38),
+ (39, 39),
+ (40, 40),
+ (41, 41),
+ #
+ (42, 42),
+ (43, 43),
+ (44, 44),
+ (45, 45),
+ (46, 46),
+ (47, 47),
+ (48, 48),
+ (49, 49),
+ (50, 50),
+ #
+ (51, 51),
+ (52, 52),
+ (53, 53),
+ (54, 54),
+ #
+ (55, 58),
+ (56, 59),
+ (57, 60),
+ (58, 61),
+ (59, 62),
+ #
+ (60, 66),
+ (61, 67),
+ (62, 68),
+ (63, 69),
+ (64, 70),
+ (65, 71),
+ (66, 72),
+ (67, 73),
+ #
+ (68, 75),
+ (69, 76),
+ (70, 77),
+ (71, 78),
+ (72, 79),
+ (73, 80),
+ (74, 81),
+ (75, 82),
+ #
+ (76, 84),
+ (77, 85),
+ (78, 86),
+ (79, 87),
+ (80, 88),
+ (81, 89),
+ (82, 90),
+ (83, 91),
+ (84, 92),
+ (85, 93),
+ (86, 94),
+ (87, 95),
+ (88, 96),
+ (89, 97),
+ (90, 98),
+ (91, 99),
+ (92, 100),
+ (93, 101),
+ (94, 102),
+ (95, 103),
+ #
+ (96, 104),
+ #
+ (97, 105)
+]
+
+mapping_cofw = [
+ #
+ (0, 33),
+ (2, 38),
+ (4, 35),
+ (5, 40),
+ #
+ (1, 46),
+ (3, 50),
+ (6, 44),
+ (7, 48),
+ #
+ (8, 60),
+ (10, 64),
+ (12, 62),
+ (13, 66),
+ #
+ (9, 72),
+ (11, 68),
+ (14, 70),
+ (15, 74),
+ #
+ (18, 57),
+ (19, 63),
+ (20, 54),
+ (21, 60),
+ #
+ (22, 84),
+ (23, 90),
+ (24, 87),
+ (25, 98),
+ (26, 102),
+ (27, 93),
+ #
+ (28, 16)
+]
+dataset_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_133kpt.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[
+ dataset_lapa, dataset_coco, dataset_wflw, dataset_300w,
+ dataset_cofw, dataset_halpe
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
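
Note the two-level pipeline here: each sub-dataset first runs its own `KeypointConverter` pipeline to land in the shared 106-point layout, and only then does `CombinedDataset` apply the common `train_pipeline`. A rough sketch of that composition, with plain callables standing in for the dict-configured transforms (hypothetical, not the mmengine implementation):

```python
def combined_samples(datasets, shared_pipeline):
    """Yield samples the way CombinedDataset conceptually does."""
    for per_dataset_pipeline, samples in datasets:
        for sample in samples:
            for step in per_dataset_pipeline:   # e.g. KeypointConverter
                sample = step(sample)
            for step in shared_pipeline:        # shared augmentation/encoding
                sample = step(sample)
            yield sample

# hypothetical callables standing in for the dict-configured transforms
to_106 = lambda s: {**s, 'layout': '106pt'}
augment = lambda s: {**s, 'augmented': True}

datasets = [([to_106], [{'src': 'coco'}]), ([], [{'src': 'lapa'}])]
for s in combined_samples(datasets, shared_pipeline=[augment]):
    print(s)
```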
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# test dataset
+val_lapa = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_test.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[],
+)
+
+val_coco = dict(
+ type='CocoWholeBodyFaceDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_wflw)
+ ],
+)
+
+val_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_test.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=kpt_68_to_106)
+ ],
+)
+
+val_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_test.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_cofw)
+ ],
+)
+
+val_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter', num_keypoints=106, mapping=mapping_halpe)
+ ],
+)
+
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/lapa.py'),
+ datasets=[val_lapa, val_coco, val_wflw, val_300w, val_cofw, val_halpe],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
+custom_hooks = [
+ # dict(
+ # type='EMAHook',
+ # ema_type='ExpMomentumEMA',
+ # momentum=0.0002,
+ # update_buffers=True,
+ # priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
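
`PipelineSwitchHook` swaps in the milder stage-2 augmentations for the final `stage2_num_epochs` epochs. Assuming the runtime values implied by the `120e` config name (`max_epochs=120`, `stage2_num_epochs=10`), the switch lands at epoch 110:

```python
max_epochs = 120          # assumed from the 120e config name
stage2_num_epochs = 10

switch_epoch = max_epochs - stage2_num_epochs
assert switch_epoch == 110
# epochs [0, 110):   train_pipeline        (CoarseDropout p=1.0)
# epochs [110, 120): train_pipeline_stage2 (CoarseDropout p=0.5)
```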
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
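
The NME evaluator with `norm_mode='keypoint_distance'` divides the mean per-keypoint L2 error by a dataset-defined reference distance between two keypoints (for faces this is typically the inter-ocular distance). A hedged sketch of the computation; the reference indices below are placeholders, not the LaPa definition:

```python
import numpy as np

def nme(pred, gt, idx_a, idx_b):
    """Normalized Mean Error for one face.

    pred, gt: (K, 2) keypoint arrays. The mean per-point L2 error is
    divided by the distance between two reference keypoints (the pair
    used by keypoint_distance mode is defined per dataset).
    """
    norm = np.linalg.norm(gt[idx_a] - gt[idx_b])
    return np.linalg.norm(pred - gt, axis=1).mean() / norm

# toy usage with a 106-point face; 0/1 are placeholder reference indices
gt = np.random.rand(106, 2)
pred = gt + np.random.normal(scale=0.01, size=gt.shape)
print(nme(pred, gt, idx_a=0, idx_b=1))
```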
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md
new file mode 100644
index 0000000..f8f3764
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.md
@@ -0,0 +1,71 @@
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+- Results on COCO val2017 are obtained with a detector that has a human AP of 56.4 on COCO val2017.
+- `Face6` and `*` denote models trained on 6 public datasets:
+ - [COCO-Wholebody-Face](https://github.com/jin-s13/COCO-WholeBody/)
+ - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html)
+ - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/)
+ - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/)
+ - [Halpe](https://github.com/Fang-Haoshu/Halpe-FullBody/)
+ - [LaPa](https://github.com/JDAI-CV/lapa-dataset)
+
+| Config | Input Size | NME (LaPa) | FLOPS (G) | Download |
+| :--------------------------------------------------------------------------: | :--------: | :----------------: | :---------------: | :-----------------------------------------------------------------------------: |
+| [RTMPose-t\*](./rtmpose-t_8xb256-120e_face6-256x256.py) | 256x256 | 1.67 | 0.652 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-face6_pt-in1k_120e-256x256-df79d9a5_20230529.pth) |
+| [RTMPose-s\*](./rtmpose-s_8xb256-120e_face6-256x256.py) | 256x256 | 1.59 | 1.119 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-face6_pt-in1k_120e-256x256-d779fdef_20230529.pth) |
+| [RTMPose-m\*](./rtmpose-m_8xb256-120e_face6-256x256.py) | 256x256 | 1.44 | 2.852 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-face6_pt-in1k_120e-256x256-72a37400_20230529.pth) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml
new file mode 100644
index 0000000..a9eb2fc
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/face6/rtmpose_face6.yml
@@ -0,0 +1,51 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-t_8xb256-120e_face6-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: &id002
+ - COCO-Wholebody-Face
+ - WFLW
+ - 300W
+ - COFW
+ - Halpe
+ - LaPa
+ Name: rtmpose-t_8xb256-120e_face6-256x256
+ Results:
+ - Dataset: Face6
+ Metrics:
+ NME: 1.67
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-face6_pt-in1k_120e-256x256-df79d9a5_20230529.pth
+- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-s_8xb256-120e_face6-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-s_8xb256-120e_face6-256x256
+ Results:
+ - Dataset: Face6
+ Metrics:
+ NME: 1.59
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-face6_pt-in1k_120e-256x256-d779fdef_20230529.pth
+- Config: configs/face_2d_keypoint/rtmpose/face6/rtmpose-m_8xb256-120e_face6-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: *id002
+ Name: rtmpose-m_8xb256-120e_face6-256x256
+ Alias: face
+ Results:
+ - Dataset: Face6
+ Metrics:
+ NME: 1.44
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-face6_pt-in1k_120e-256x256-72a37400_20230529.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py
new file mode 100644
index 0000000..8e43b73
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py
@@ -0,0 +1,246 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 120
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
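
Taken together, the schedule warms up linearly over the first 1000 iterations from `base_lr * 1e-5`, holds `base_lr` until epoch 60, then cosine-anneals to `base_lr * 0.05` by epoch 120; `auto_scale_lr` additionally rescales the rate linearly when the actual total batch size differs from 512. A sketch of the resulting learning rate, assuming the standard warm-up and cosine formulas:

```python
import math

base_lr = 4e-3
max_epochs, warmup_iters = 120, 1000

def scaled_base_lr(actual_batch_size, base_batch_size=512):
    # linear scaling rule applied by auto_scale_lr
    return base_lr * actual_batch_size / base_batch_size

def lr_at(epoch, it, iters_per_epoch):
    """Scheduled learning rate (sketch; ignores auto-scaling)."""
    global_it = epoch * iters_per_epoch + it
    if global_it < warmup_iters:                 # LinearLR warm-up
        factor = 1.0e-5 + (1.0 - 1.0e-5) * global_it / warmup_iters
        return base_lr * factor
    if epoch < max_epochs // 2:                  # flat until epoch 60
        return base_lr
    eta_min = base_lr * 0.05                     # CosineAnnealingLR tail
    t = (epoch - max_epochs // 2) / (max_epochs // 2)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

print(lr_at(0, 0, 100))      # ~4e-8: warm-up start
print(lr_at(60, 0, 100))     # 4e-3: cosine phase begins at base_lr
print(lr_at(119, 0, 100))    # approaching eta_min = 2e-4
```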
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
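
The SimCC codec turns each coordinate into a 1-D classification over `input_size * simcc_split_ratio` bins — here 256 × 2 = 512 bins per axis — with a Gaussian soft label of width `sigma` centered on the ground-truth bin. A minimal numpy sketch of that encoding, under those assumptions:

```python
import numpy as np

input_size, split_ratio, sigma = 256, 2.0, 5.66
num_bins = int(input_size * split_ratio)    # 512 bins per axis

def simcc_label(coord):
    """Encode one pixel coordinate as a Gaussian over sub-pixel bins."""
    mu = coord * split_ratio                # ground-truth bin (sub-pixel)
    bins = np.arange(num_bins)
    # unnormalized Gaussian soft label (normalize=False in this codec)
    return np.exp(-((bins - mu) ** 2) / (2 * sigma ** 2))

x_label = simcc_label(100.0)                # x-axis target for x = 100 px
print(x_label.argmax())                      # -> 200 (i.e. 100 * 2)
```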
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=106,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'LapaDataset'
+data_mode = 'topdown'
+data_root = 'data/LaPa/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/pose/LaPa/',
+# f'{data_root}': 's3://openmmlab/datasets/pose/LaPa/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.2),
+ dict(type='MedianBlur', p=0.2),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/lapa_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/lapa_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/lapa_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
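
Unlike the face6 config, `EMAHook` is active here: it keeps an exponential moving average of the weights (momentum 0.0002) for evaluation and checkpointing. A hedged sketch of the basic EMA update such a hook maintains (the actual `ExpMomentumEMA` additionally warms the momentum over early steps):

```python
def ema_update(ema, weights, momentum=0.0002):
    """One exponential-moving-average step over parameter tensors."""
    for k in ema:
        ema[k] = (1 - momentum) * ema[k] + momentum * weights[k]
    return ema

ema = {'w': 0.0}
for _ in range(10_000):
    ema = ema_update(ema, {'w': 1.0})
print(ema['w'])   # ~0.86: the average smoothly trails the live weight
```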
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md
new file mode 100644
index 0000000..837d3fd
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.md
@@ -0,0 +1,40 @@
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+LaPa (AAAI'2020)
+
+```bibtex
+@inproceedings{liu2020new,
+ title={A New Dataset and Boundary-Attention Semantic Segmentation for Face Parsing.},
+ author={Liu, Yinglu and Shi, Hailin and Shen, Hao and Si, Yue and Wang, Xiaobo and Mei, Tao},
+ booktitle={AAAI},
+ pages={11637--11644},
+ year={2020}
+}
+```
+
+Results on LaPa val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: |
+| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py) | 256x256 | 1.29 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml
new file mode 100644
index 0000000..9f4cf04
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/lapa/rtmpose_lapa.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/rtmpose/lapa/rtmpose-m_8xb64-120e_lapa-256x256.py
+ In Collection: RTMPose
+ Alias: face
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: LaPa
+ Name: rtmpose-m_8xb64-120e_lapa-256x256
+ Results:
+  - Dataset: LaPa
+ Metrics:
+ NME: 1.29
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-lapa_pt-aic-coco_120e-256x256-762b1ae2_20230422.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py
new file mode 100644
index 0000000..833235d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py
@@ -0,0 +1,231 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 60
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=98,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/',
+# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='NME', rule='less', max_keep_ckpts=1, interval=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md
new file mode 100644
index 0000000..30554f7
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md
@@ -0,0 +1,42 @@
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+Results on WFLW dataset
+
+The model is trained on WFLW train.
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: |
+| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py) | 256x256 | 4.01 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml
new file mode 100644
index 0000000..7ec6a7f
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.yml
@@ -0,0 +1,14 @@
+Models:
+- Config: configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: WFLW
+ Name: rtmpose-m_8xb64-60e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME: 4.01
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md
new file mode 100644
index 0000000..8da5476
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.md
@@ -0,0 +1,44 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+300W (IMAVIS'2016)
+
+```bibtex
+@article{sagonas2016300,
+ title={300 faces in-the-wild challenge: Database and results},
+ author={Sagonas, Christos and Antonakos, Epameinondas and Tzimiropoulos, Georgios and Zafeiriou, Stefanos and Pantic, Maja},
+ journal={Image and vision computing},
+ volume={47},
+ pages={3--18},
+ year={2016},
+ publisher={Elsevier}
+}
+```
+
+Results on 300W dataset
+
+The model is trained on 300W train.
+
+| Arch | Input Size | NME*common* | NME*challenge* | NME*full* | NME*test* | ckpt | log |
+| :--------------------------------- | :--------: | :--------------------: | :-----------------------: | :------------------: | :------------------: | :---------------------------------: | :--------------------------------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py) | 256x256 | 2.92 | 5.64 | 3.45 | 4.10 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256-eea53406_20211019.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256_20211019.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml
new file mode 100644
index 0000000..4a813d1
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/hrnetv2_300w.yml
@@ -0,0 +1,23 @@
+Collections:
+- Name: HRNetv2
+ Paper:
+ Title: Deep High-Resolution Representation Learning for Visual Recognition
+ URL: https://ieeexplore.ieee.org/abstract/document/9052469/
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnetv2.md
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: 300W
+ Name: td-hm_hrnetv2-w18_8xb64-60e_300w-256x256
+ Results:
+ - Dataset: 300W
+ Metrics:
+ NME challenge: 5.64
+ NME common: 2.92
+ NME full: 3.45
+ NME test: 4.1
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_300w_256x256-eea53406_20211019.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py
new file mode 100644
index 0000000..7f279f0
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py
@@ -0,0 +1,161 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=1.5)
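
The MSRA-style codec renders each keypoint as a 2-D Gaussian of width `sigma` on the 64×64 heatmap, a 4× downsampling of the 256×256 input. A minimal sketch of that target generation, assuming the standard unnormalized Gaussian:

```python
import numpy as np

heatmap_size, sigma = (64, 64), 1.5
stride = 256 / 64            # input -> heatmap downsampling factor

def keypoint_heatmap(x, y):
    """Render one keypoint (input-image pixels) as a Gaussian heatmap."""
    w, h = heatmap_size
    xs, ys = np.meshgrid(np.arange(w), np.arange(h))
    mu_x, mu_y = x / stride, y / stride
    return np.exp(-((xs - mu_x) ** 2 + (ys - mu_y) ** 2) / (2 * sigma ** 2))

hm = keypoint_heatmap(128.0, 64.0)
print(np.unravel_index(hm.argmax(), hm.shape))   # -> (16, 32), i.e. (y, x)
```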
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=68,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
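
The `FeatureMapProcessor` neck with `concat=True` brings the four HRNetv2 branch outputs to a common resolution and concatenates them along the channel axis, which is where the head's `in_channels=270` comes from:

```python
stage4_num_channels = (18, 36, 72, 144)   # the four HRNetv2-w18 branches
assert sum(stage4_num_channels) == 270    # matches head.in_channels
```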
+
+# base dataset settings
+dataset_type = 'Face300WDataset'
+data_mode = 'topdown'
+data_root = 'data/300w/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_300w_valid.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md
new file mode 100644
index 0000000..02c4bb7
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.md
@@ -0,0 +1,42 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+300WLP (IEEE'2017)
+
+```bibtex
+@article{zhu2017face,
+ title={Face alignment in full pose range: A 3d total solution},
+ author={Zhu, Xiangyu and Liu, Xiaoming and Lei, Zhen and Li, Stan Z},
+ journal={IEEE transactions on pattern analysis and machine intelligence},
+ year={2017},
+ publisher={IEEE}
+}
+```
+
+Results on 300W-LP dataset
+
+The model is trained on 300W-LP train.
+
+| Arch | Input Size | NME*full* | NME*test* | ckpt | log |
+| :------------------------------------------------- | :--------: | :------------------: | :------------------: | :------------------------------------------------: | :------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py) | 256x256 | 0.0413 | 0.04125 | [ckpt](https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.pth) | [log](https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml
new file mode 100644
index 0000000..8a63761
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_300wlp.yml
@@ -0,0 +1,20 @@
+Collections:
+- Name: HRNetv2
+ Paper:
+ Title: Deep High-Resolution Representation Learning for Visual Recognition
+ URL: https://ieeexplore.ieee.org/abstract/document/9052469/
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/hrnetv2.md
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: 300W-LP
+ Name: td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256
+ Results:
+ - Dataset: 300W-LP
+ Metrics:
+ NME full: 0.0413
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/face_2d_keypoint/topdown_heatmap/300wlp/hrnetv2_w18_300wlp_256x256-fb433d21_20230922.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py
new file mode 100644
index 0000000..b4b5f61
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/300wlp/td-hm_hrnetv2-w18_8xb64-60e_300wlp-256x256.py
@@ -0,0 +1,160 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=1.5)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=68,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Face300WLPDataset'
+data_mode = 'topdown'
+data_root = 'data/300wlp/'
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+    batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_300wlp_train.json',
+ data_prefix=dict(img='train/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_300wlp_valid.json',
+ data_prefix=dict(img='val/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..53fd36d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,57 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator will produce heatmaps which represent the likelihood of being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
+
+
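
To go from heatmaps back to coordinates, the decoder essentially takes a per-keypoint argmax on the heatmap grid and rescales it to the input resolution (production decoders add sub-pixel refinements such as a shift toward the second-highest neighbor, or DARK). A minimal sketch, assuming plain argmax decoding:

```python
import numpy as np

def decode_heatmaps(heatmaps, input_size=(256, 256)):
    """heatmaps: (K, H, W) -> (K, 2) keypoints in input-image pixels."""
    K, H, W = heatmaps.shape
    flat = heatmaps.reshape(K, -1).argmax(axis=1)
    ys, xs = np.unravel_index(flat, (H, W))
    scale = np.array([input_size[0] / W, input_size[1] / H])
    return np.stack([xs, ys], axis=1) * scale

kpts = decode_heatmaps(np.random.rand(68, 64, 64))  # e.g. 68 face keypoints
print(kpts.shape)                                   # (68, 2)
```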
+## Results and Models
+
+### 300W Dataset
+
+Results on 300W dataset
+
+| Model | Input Size | NME*common* | NME*challenge* | NME*full* | NME*test* | Details and Download |
+| :---------: | :--------: | :--------------------: | :-----------------------: | :------------------: | :------------------: | :---------------------------------------: |
+| HRNetv2-w18 | 256x256 | 2.92 | 5.64 | 3.45 | 4.10 | [hrnetv2_300w.md](./300w/hrnetv2_300w.md) |
+
+### AFLW Dataset
+
+Results on AFLW dataset
+
+| Model | Input Size | NME*full* | NME*frontal* | Details and Download |
+| :--------------: | :--------: | :------------------: | :---------------------: | :-------------------------------------------------: |
+| HRNetv2-w18+Dark | 256x256 | 1.35 | 1.19 | [hrnetv2_dark_aflw.md](./aflw/hrnetv2_dark_aflw.md) |
+| HRNetv2-w18 | 256x256 | 1.41 | 1.27 | [hrnetv2_aflw.md](./aflw/hrnetv2_aflw.md) |
+
+### COCO-WholeBody-Face Dataset
+
+Results on COCO-WholeBody-Face val set
+
+| Model | Input Size | NME | Details and Download |
+| :--------------: | :--------: | :----: | :----------------------------------------------------------------------------------------------: |
+| HRNetv2-w18+Dark | 256x256 | 0.0513 | [hrnetv2_dark_coco_wholebody_face.md](./coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md) |
+| SCNet-50 | 256x256 | 0.0567 | [scnet_coco_wholebody_face.md](./coco_wholebody_face/scnet_coco_wholebody_face.md) |
+| HRNetv2-w18 | 256x256 | 0.0569 | [hrnetv2_coco_wholebody_face.md](./coco_wholebody_face/hrnetv2_coco_wholebody_face.md) |
+| ResNet-50 | 256x256 | 0.0582 | [resnet_coco_wholebody_face.md](./coco_wholebody_face/resnet_coco_wholebody_face.md) |
+| HourglassNet | 256x256 | 0.0587 | [hourglass_coco_wholebody_face.md](./coco_wholebody_face/hourglass_coco_wholebody_face.md) |
+| MobileNet-v2 | 256x256 | 0.0611 | [mobilenetv2_coco_wholebody_face.md](./coco_wholebody_face/mobilenetv2_coco_wholebody_face.md) |
+
+### COFW Dataset
+
+Results on COFW dataset
+
+| Model | Input Size | NME | Details and Download |
+| :---------: | :--------: | :--: | :---------------------------------------: |
+| HRNetv2-w18 | 256x256 | 3.48 | [hrnetv2_cofw.md](./cofw/hrnetv2_cofw.md) |
+
+### WFLW Dataset
+
+Results on WFLW dataset
+
+| Model | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | Details and Download |
+| :-----: | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------------------: |
+| HRNetv2-w18+Dark | 256x256 | 3.98 | 6.98 | 3.96 | 4.78 | 4.56 | 3.89 | 4.29 | [hrnetv2_dark_wflw.md](./wflw/hrnetv2_dark_wflw.md) |
+| HRNetv2-w18+AWing | 256x256 | 4.02 | 6.94 | 3.97 | 4.78 | 4.59 | 3.87 | 4.28 | [hrnetv2_awing_wflw.md](./wflw/hrnetv2_awing_wflw.md) |
+| HRNetv2-w18 | 256x256 | 4.06 | 6.97 | 3.99 | 4.83 | 4.58 | 3.94 | 4.33 | [hrnetv2_wflw.md](./wflw/hrnetv2_wflw.md) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md
new file mode 100644
index 0000000..36aade2
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.md
@@ -0,0 +1,43 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+AFLW (ICCVW'2011)
+
+```bibtex
+@inproceedings{koestinger2011annotated,
+ title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
+ author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
+ booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)},
+ pages={2144--2151},
+ year={2011},
+ organization={IEEE}
+}
+```
+
+Results on AFLW dataset
+
+The model is trained on AFLW train and evaluated on AFLW full and frontal.
+
+| Arch | Input Size | NME*full* | NME*frontal* | ckpt | log |
+| :------------------------------------------------ | :--------: | :------------------: | :---------------------: | :-----------------------------------------------: | :-----------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py) | 256x256 | 1.41 | 1.27 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256_20210125.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml
new file mode 100644
index 0000000..ce0bdcc
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_aflw.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: AFLW
+ Name: td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256
+ Results:
+ - Dataset: AFLW
+ Metrics:
+ NME frontal: 1.27
+ NME full: 1.41
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md
new file mode 100644
index 0000000..fc4f25e
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.md
@@ -0,0 +1,60 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+AFLW (ICCVW'2011)
+
+```bibtex
+@inproceedings{koestinger2011annotated,
+ title={Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization},
+ author={Koestinger, Martin and Wohlhart, Paul and Roth, Peter M and Bischof, Horst},
+ booktitle={2011 IEEE international conference on computer vision workshops (ICCV workshops)},
+ pages={2144--2151},
+ year={2011},
+ organization={IEEE}
+}
+```
+
+Results on AFLW dataset
+
+The model is trained on AFLW train and evaluated on AFLW full and frontal.
+
+| Arch | Input Size | NME*full* | NME*frontal* | ckpt | log |
+| :------------------------------------------------ | :--------: | :------------------: | :---------------------: | :-----------------------------------------------: | :-----------------------------------------------: |
+| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py) | 256x256 | 1.35 | 1.19 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark-219606c0_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark_20210125.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml
new file mode 100644
index 0000000..955adb6
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: AFLW
+ Name: td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256
+ Results:
+ - Dataset: AFLW
+ Metrics:
+ NME frontal: 1.19
+    NME full: 1.35
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_aflw_256x256_dark-219606c0_20210125.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py
new file mode 100644
index 0000000..50d197b
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py
@@ -0,0 +1,156 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=19,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/aflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_aflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_aflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME', norm_mode='use_norm_item', norm_item='bbox_size')
+test_evaluator = val_evaluator
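A quick way to sanity-check this config is through mmpose's high-level inference APIs. The sketch below is illustrative only: the checkpoint filename is a placeholder for a locally downloaded copy of the weights linked in the model card, and `face.jpg` stands in for any face crop.

```python
# Hedged inference sketch for the AFLW config above (mmpose 1.x APIs).
from mmpose.apis import inference_topdown, init_model

CONFIG = ('modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/'
          'td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py')
CHECKPOINT = 'hrnetv2_w18_aflw_256x256.pth'  # placeholder: download via the model card

model = init_model(CONFIG, CHECKPOINT, device='cpu')
# With bboxes omitted, the whole image is treated as a single face instance.
results = inference_topdown(model, 'face.jpg')
print(results[0].pred_instances.keypoints.shape)  # (1, 19, 2): AFLW defines 19 landmarks
```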
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py
new file mode 100644
index 0000000..335cd34
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py
@@ -0,0 +1,160 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=19,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'AFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/aflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_aflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_aflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME', norm_mode='use_norm_item', norm_item='bbox_size')
+test_evaluator = val_evaluator
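Relative to the plain AFLW config above, this dark variant changes exactly one thing: the codec adds `unbiased=True`, switching the MSRA heatmap encoding/decoding to DARK's distribution-aware coordinate representation. A sketch for confirming that, assuming the vendored `_base_` files are present so mmengine can resolve the `_base_` inheritance:

```python
# Hedged sketch: load both AFLW configs and compare their codecs.
from mmengine.config import Config

ROOT = 'modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/aflw/'
plain = Config.fromfile(ROOT + 'td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py')
dark = Config.fromfile(ROOT + 'td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py')

print(plain.codec)  # MSRAHeatmap, input 256x256, heatmaps 64x64, sigma=2
print(dark.codec)   # identical except for unbiased=True (the DARK decoder)
```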
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md
new file mode 100644
index 0000000..26f08da
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.md
@@ -0,0 +1,39 @@
+
+Hourglass (ECCV'2016)
+
+```bibtex
+@inproceedings{newell2016stacked,
+ title={Stacked hourglass networks for human pose estimation},
+ author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
+ booktitle={European conference on computer vision},
+ pages={483--499},
+ year={2016},
+ organization={Springer}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_hourglass_52](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0587 | [ckpt](https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256-6994cf2e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml
new file mode 100644
index 0000000..185474f
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml
@@ -0,0 +1,14 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture:
+ - Hourglass
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0587
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hourglass/hourglass52_coco_wholebody_face_256x256-6994cf2e_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md
new file mode 100644
index 0000000..3cf9109
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.md
@@ -0,0 +1,39 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0569 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256-c1ca469b_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml
new file mode 100644
index 0000000..e7e526d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml
@@ -0,0 +1,14 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0569
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_coco_wholebody_face_256x256-c1ca469b_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md
new file mode 100644
index 0000000..60914db
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.md
@@ -0,0 +1,56 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0513 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark-3d9a334e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml
new file mode 100644
index 0000000..ca0cefd
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0513
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_coco_wholebody_face_256x256_dark-3d9a334e_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md
new file mode 100644
index 0000000..a520407
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.md
@@ -0,0 +1,38 @@
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_mobilenetv2](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0611 | [ckpt](https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256-4a3f096e_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml
new file mode 100644
index 0000000..6d80729
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0611
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/mobilenetv2/mobilenetv2_coco_wholebody_face_256x256-4a3f096e_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md
new file mode 100644
index 0000000..296588c
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.md
@@ -0,0 +1,55 @@
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_res50](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0582 | [ckpt](https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256-5128edf5_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml
new file mode 100644
index 0000000..c63e04b
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_res50_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0582
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/resnet/res50_coco_wholebody_face_256x256-5128edf5_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md
new file mode 100644
index 0000000..368b16b
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.md
@@ -0,0 +1,38 @@
+
+SCNet (CVPR'2020)
+
+```bibtex
+@inproceedings{liu2020improving,
+ title={Improving Convolutional Networks with Self-Calibrated Convolutions},
+ author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={10096--10105},
+ year={2020}
+}
+```
+
+COCO-WholeBody-Face (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on COCO-WholeBody-Face val set
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [pose_scnet_50](/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0567 | [ckpt](https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256-a0183f5f_20210909.pth) | [log](https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml
new file mode 100644
index 0000000..d0fde1e
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml
@@ -0,0 +1,15 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - SCNet
+ Training Data: COCO-WholeBody-Face
+ Name: td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Face
+ Metrics:
+ NME: 0.0567
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/scnet/scnet50_coco_wholebody_face_256x256-a0183f5f_20210909.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..135a45f
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=68,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
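The COCO-WholeBody-Face configs evaluate NME with `norm_mode='keypoint_distance'`, while the AFLW configs above normalize by `bbox_size`; in both cases the metric is the mean per-keypoint L2 error divided by a per-sample normalizer. A hedged NumPy sketch of the computation (the eye-corner indices 36 and 45 are purely illustrative, not the dataset's actual normalization pair):

```python
# Hedged sketch of the NME metric configured above.
import numpy as np

def nme(pred, gt, norm):
    """pred, gt: (N, K, 2) keypoint arrays; norm: (N,) per-sample normalizer."""
    err = np.linalg.norm(pred - gt, axis=-1)    # (N, K) pixel errors
    return float(np.mean(err / norm[:, None]))  # normalize per sample, then average

rng = np.random.default_rng(0)
gt = rng.uniform(0, 256, (4, 68, 2))            # 68 face keypoints, as in this config
pred = gt + rng.normal(0, 2, gt.shape)
norm = np.linalg.norm(gt[:, 36] - gt[:, 45], axis=-1)  # illustrative keypoint distance
print(nme(pred, gt, norm))
```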
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..b751fae
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,156 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=68,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
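The head's `in_channels=270` follows from the neck: `FeatureMapProcessor` with `concat=True` fuses HRNet's four branch outputs (18 + 36 + 72 + 144 = 270 channels) at a common resolution. A minimal sketch of that fusion, assuming bilinear upsampling to the highest-resolution branch:

```python
# Hedged sketch of the concat fusion behind in_channels=270.
import torch
import torch.nn.functional as F

# Four HRNet branch outputs for a 256x256 input: strides 4, 8, 16, 32.
feats = [torch.randn(1, c, 64 // 2**i, 64 // 2**i)
         for i, c in enumerate((18, 36, 72, 144))]
target = feats[0].shape[2:]
fused = torch.cat(
    [F.interpolate(f, size=target, mode='bilinear', align_corners=False)
     for f in feats], dim=1)
print(fused.shape)  # torch.Size([1, 270, 64, 64])
```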
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..a31e599
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,160 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=68,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..c4a314d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=68,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..7b4dcad
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=68,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
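With `deconv_out_channels` left at the `HeatmapHead` default, this is the classic SimpleBaseline2D layout: ResNet-50 reduces the 256×256 crop to a 2048-channel 8×8 feature map, and three stride-2 deconvolutions recover the 64×64 heatmap grid. A shape-arithmetic sketch (batch norm and weight initialization omitted; this is not the exact mmpose module):

```python
# Hedged sketch of the default SimpleBaseline2D head shape arithmetic.
import torch
import torch.nn as nn

head = nn.Sequential(
    nn.ConvTranspose2d(2048, 256, kernel_size=4, stride=2, padding=1),  # 8 -> 16
    nn.ReLU(inplace=True),
    nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1),   # 16 -> 32
    nn.ReLU(inplace=True),
    nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1),   # 32 -> 64
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 68, kernel_size=1),  # one heatmap per face keypoint
)
print(head(torch.randn(1, 2048, 8, 8)).shape)  # torch.Size([1, 68, 64, 64])
```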
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py
new file mode 100644
index 0000000..62b7885
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet50-7ef0a199.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=68,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyFaceDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md
new file mode 100644
index 0000000..4828f2c
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.md
@@ -0,0 +1,42 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+COFW (ICCV'2013)
+
+```bibtex
+@inproceedings{burgos2013robust,
+ title={Robust face landmark estimation under occlusion},
+ author={Burgos-Artizzu, Xavier P and Perona, Pietro and Doll{\'a}r, Piotr},
+ booktitle={Proceedings of the IEEE international conference on computer vision},
+ pages={1513--1520},
+ year={2013}
+}
+```
+
+Results on COFW dataset
+
+The model is trained on the COFW train set.
+
+| Arch | Input Size | NME | ckpt | log |
+| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py) | 256x256 | 3.48 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256-49243ab8_20211019.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256_20211019.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml
new file mode 100644
index 0000000..749e348
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/hrnetv2_cofw.yml
@@ -0,0 +1,14 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: COFW
+ Name: td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256
+ Results:
+ - Dataset: COFW
+ Metrics:
+ NME: 3.48
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_cofw_256x256-49243ab8_20211019.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py
new file mode 100644
index 0000000..ee59f3d
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py
@@ -0,0 +1,161 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+    end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=1.5)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=29,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'COFWDataset'
+data_mode = 'topdown'
+data_root = 'data/cofw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/cofw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/cofw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
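All of these top-down configs share the same preprocessing geometry: `GetBBoxCenterScale` converts the face box to a padded center/scale (1.25× by default in mmpose), and `TopdownAffine` warps that region to the model's input size. A simplified OpenCV sketch of the idea; mmpose additionally fixes the crop's aspect ratio, which is omitted here:

```python
# Hedged sketch of the bbox -> center/scale -> affine-crop step.
import cv2
import numpy as np

def crop_topdown(img, bbox, input_size=(256, 256), padding=1.25):
    x1, y1, x2, y2 = bbox
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
    w, h = (x2 - x1) * padding, (y2 - y1) * padding
    # Three correspondences define the affine map: center, top-center, top-right.
    src = np.float32([[cx, cy], [cx, cy - h / 2], [cx + w / 2, cy - h / 2]])
    dw, dh = input_size
    dst = np.float32([[dw / 2, dh / 2], [dw / 2, 0], [dw, 0]])
    return cv2.warpAffine(img, cv2.getAffineTransform(src, dst), input_size)

patch = crop_topdown(np.zeros((480, 640, 3), np.uint8), (100, 80, 300, 320))
print(patch.shape)  # (256, 256, 3)
```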
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md
new file mode 100644
index 0000000..4df239a
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.md
@@ -0,0 +1,59 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+AdaptiveWingloss (ICCV'2019)
+
+```bibtex
+@inproceedings{wang2019adaptive,
+ title={Adaptive wing loss for robust face alignment via heatmap regression},
+ author={Wang, Xinyao and Bo, Liefeng and Fuxin, Li},
+ booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
+ pages={6971--6981},
+ year={2019}
+}
+```
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Arch | Input Size | NME (test) | NME (pose) | NME (illumination) | NME (occlusion) | NME (blur) | NME (makeup) | NME (expression) | ckpt | log |
+| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: |
+| [pose_hrnetv2_w18_awing](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py) | 256x256 | 4.02 | 6.94 | 3.97 | 4.78 | 4.59 | 3.87 | 4.28 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing-5af5055c_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing_20211212.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml
new file mode 100644
index 0000000..6a6d46a
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml
@@ -0,0 +1,21 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - AdaptiveWingloss
+ Training Data: WFLW
+ Name: td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME blur: 4.59
+ NME expression: 4.28
+ NME illumination: 3.97
+ NME makeup: 3.87
+ NME occlusion: 4.78
+ NME pose: 6.94
+ NME test: 4.02
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_awing-5af5055c_20211212.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md
new file mode 100644
index 0000000..b36477b
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.md
@@ -0,0 +1,59 @@
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Arch | Input Size | NME (test) | NME (pose) | NME (illumination) | NME (occlusion) | NME (blur) | NME (makeup) | NME (expression) | ckpt | log |
+| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: |
+| [pose_hrnetv2_w18_dark](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py) | 256x256 | 3.98 | 6.98 | 3.96 | 4.78 | 4.56 | 3.89 | 4.29 | [ckpt](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark-3f8e0c2c_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark_20210125.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml
new file mode 100644
index 0000000..303be33
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml
@@ -0,0 +1,21 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: WFLW
+ Name: td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME blur: 4.56
+ NME expression: 4.29
+ NME illumination: 3.96
+ NME makeup: 3.89
+ NME occlusion: 4.78
+ NME pose: 6.98
+ NME test: 3.98
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/darkpose/hrnetv2_w18_wflw_256x256_dark-3f8e0c2c_20210125.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md
new file mode 100644
index 0000000..121f993
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.md
@@ -0,0 +1,42 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Arch | Input Size | NME*test* | NME*pose* | NME*illumination* | NME*occlusion* | NME*blur* | NME*makeup* | NME*expression* | ckpt | log |
+| :--------- | :--------: | :------------------: | :------------------: | :--------------------------: | :-----------------------: | :------------------: | :--------------------: | :------------------------: | :--------: | :-------: |
+| [pose_hrnetv2_w18](/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py) | 256x256 | 4.06 | 6.97 | 3.99 | 4.83 | 4.58 | 3.94 | 4.33 | [ckpt](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256-2bf032a6_20210125.pth) | [log](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256_20210125.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml
new file mode 100644
index 0000000..2d188c3
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml
@@ -0,0 +1,20 @@
+Models:
+- Config: configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: WFLW
+ Name: td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME blur: 4.58
+ NME expression: 4.33
+ NME illumination: 3.99
+ NME makeup: 3.94
+ NME occlusion: 4.83
+ NME pose: 6.97
+ NME test: 4.06
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256-2bf032a6_20210125.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py
new file mode 100644
index 0000000..507035c
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
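+        # in_channels = 18 + 36 + 72 + 144 = 270: the four HRNet branch outputs concatenated by the neck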
+ in_channels=270,
+ out_channels=98,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
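+    # normalized mean error; 'keypoint_distance' normalizes by a dataset-specific keypoint pair (the inter-ocular distance for WFLW)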
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py
new file mode 100644
index 0000000..f6885da
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=98,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='AdaptiveWingLoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py
new file mode 100644
index 0000000..e1a47c7
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=60, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=2e-3,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=60,
+ milestones=[40, 55],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
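+    # unbiased encoding/decoding is the DarkPose modification; the rest of this config matches the baseline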
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18'),
+ ),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=98,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_prob=0,
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md
new file mode 100644
index 0000000..c4b1cb4
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/README.md
@@ -0,0 +1,19 @@
+# Top-down regression-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the object bounding boxes. In the second stage, regression-based methods directly regress the keypoint coordinates from the features extracted from the bounding-box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html).
+
+
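+As a minimal sketch of the idea (not the MMPose implementation — the real `RegressionHead` below is driven by the `RegressionLabel` codec and a configurable loss), a top-down regression head is essentially global average pooling followed by a fully connected layer:
+
+```python
+import torch
+import torch.nn as nn
+
+
+class MinimalRegressionHead(nn.Module):
+    """Toy top-down regression head: backbone features -> (x, y) per keypoint."""
+
+    def __init__(self, in_channels: int = 2048, num_joints: int = 98):
+        super().__init__()
+        self.num_joints = num_joints
+        self.fc = nn.Linear(in_channels, num_joints * 2)
+
+    def forward(self, feats: torch.Tensor) -> torch.Tensor:
+        # feats: (B, C, H, W) backbone features of the cropped face/person box
+        pooled = feats.mean(dim=(2, 3))           # global average pooling -> (B, C)
+        coords = self.fc(pooled)                  # -> (B, 2 * num_joints)
+        # regress coordinates in normalized [0, 1] crop space
+        return coords.sigmoid().view(-1, self.num_joints, 2)
+
+
+# e.g. 98 WFLW landmarks from ResNet-50 stage-4 features, matching the configs below
+head = MinimalRegressionHead()
+keypoints = head(torch.randn(1, 2048, 8, 8))      # -> (1, 98, 2)
+```
+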
+## Results and Models
+
+### WFLW Dataset
+
+Results on the WFLW test set
+
+| Model | Input Size | NME | ckpt | log |
+| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [ResNet-50](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py) | 256x256 | 4.88 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_20210303.log.json) |
+| [ResNet-50+WingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.67 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss_20210303.log.json) |
+| [ResNet-50+SoftWingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.44 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss_20211212.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md
new file mode 100644
index 0000000..f36b939
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.md
@@ -0,0 +1,75 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+SoftWingloss (TIP'2021)
+
+```bibtex
+@article{lin2021structure,
+ title={Structure-Coherent Deep Feature Learning for Robust Face Alignment},
+ author={Lin, Chunze and Zhu, Beier and Wang, Quan and Liao, Renjie and Qian, Chen and Lu, Jiwen and Zhou, Jie},
+ journal={IEEE Transactions on Image Processing},
+ year={2021},
+ publisher={IEEE}
+}
+```
+
+
+
+
+
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Model | Input Size | NME | ckpt | log |
+| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [ResNet-50+SoftWingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.44 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss_20211212.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml
new file mode 100644
index 0000000..9e33fe2
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_softwingloss_wflw.yml
@@ -0,0 +1,22 @@
+Collections:
+- Name: SoftWingloss
+ Paper:
+ Title: Structure-Coherent Deep Feature Learning for Robust Face Alignment
+ URL: https://ieeexplore.ieee.org/document/9442331/
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/softwingloss.md
+Models:
+- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py
+ In Collection: SoftWingloss
+ Metadata:
+ Architecture:
+ - DeepPose
+ - ResNet
+ - SoftWingloss
+ Training Data: WFLW
+ Name: td-reg_res50_softwingloss_8xb64-210e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME: 4.44
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_softwingloss-4d34f22a_20211212.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md
new file mode 100644
index 0000000..f605688
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.md
@@ -0,0 +1,58 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Model | Input Size | NME | ckpt | log |
+| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [ResNet-50](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py) | 256x256 | 4.88 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_20210303.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml
new file mode 100644
index 0000000..c86c208
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wflw.yml
@@ -0,0 +1,21 @@
+Collections:
+- Name: ResNet
+ Paper:
+ Title: Deep residual learning for image recognition
+ URL: http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/backbones/resnet.md
+Models:
+- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py
+ In Collection: ResNet
+ Metadata:
+ Architecture:
+ - DeepPose
+ - ResNet
+ Training Data: WFLW
+  Name: td-reg_res50_8xb64-210e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME: 4.88
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256-92d0ba7f_20210303.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md
new file mode 100644
index 0000000..5dc9adc
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.md
@@ -0,0 +1,76 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+Wingloss (CVPR'2018)
+
+```bibtex
+@inproceedings{feng2018wing,
+ title={Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks},
+ author={Feng, Zhen-Hua and Kittler, Josef and Awais, Muhammad and Huber, Patrik and Wu, Xiao-Jun},
+ booktitle={Computer Vision and Pattern Recognition (CVPR), 2018 IEEE Conference on},
+ year={2018},
+ pages ={2235-2245},
+ organization={IEEE}
+}
+```
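+
+For reference, the Wing loss above penalizes small localisation errors on a logarithmic curve and large ones linearly (transcribed from the paper; the width omega and curvature epsilon are hyperparameters — MMPose's `WingLoss` defaults of omega=10, epsilon=2 are an assumption worth verifying against the code):
+
+```latex
+\mathrm{wing}(x) =
+\begin{cases}
+  \omega \ln\left(1 + |x| / \epsilon\right) & \text{if } |x| < \omega \\
+  |x| - C & \text{otherwise}
+\end{cases}
+\qquad C = \omega - \omega \ln\left(1 + \omega / \epsilon\right)
+```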
+
+
+
+
+
+
+WFLW (CVPR'2018)
+
+```bibtex
+@inproceedings{wu2018look,
+ title={Look at boundary: A boundary-aware face alignment algorithm},
+ author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={2129--2138},
+ year={2018}
+}
+```
+
+
+
+Results on WFLW dataset
+
+The model is trained on the WFLW train set.
+
+| Model | Input Size | NME | ckpt | log |
+| :-------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :-----------------------------------------------------------: |
+| [ResNet-50+WingLoss](/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py) | 256x256 | 4.67 | [ckpt](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth) | [log](https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss_20210303.log.json) |
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml
new file mode 100644
index 0000000..327f07e
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/resnet_wingloss_wflw.yml
@@ -0,0 +1,23 @@
+Collections:
+- Name: Wingloss
+ Paper:
+ Title: Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural
+ Networks
+ URL: http://openaccess.thecvf.com/content_cvpr_2018/html/Feng_Wing_Loss_for_CVPR_2018_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/techniques/wingloss.md
+Models:
+- Config: configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py
+ In Collection: Wingloss
+ Metadata:
+ Architecture:
+ - DeepPose
+ - ResNet
+ - WingLoss
+ Training Data: WFLW
+ Name: td-reg_res50_wingloss_8xb64-210e_wflw-256x256
+ Results:
+ - Dataset: WFLW
+ Metrics:
+ NME: 4.67
+ Task: Face 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/face/deeppose/deeppose_res50_wflw_256x256_wingloss-f82a5e53_20210303.pth
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py
new file mode 100644
index 0000000..dd9ade7
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_8xb64-210e_wflw-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
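+    # regress all 98 WFLW landmarks as (x, y) pairs from the pooled 2048-d ResNet-50 feature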
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=98,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ train_cfg=dict(),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# dataloaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less'))
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py
new file mode 100644
index 0000000..beae1bf
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_softwingloss_8xb64-210e_wflw-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=98,
+ loss=dict(type='SoftWingLoss', use_target_weight=True),
+ decoder=codec),
+ train_cfg=dict(),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# dataloaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less'))
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py
new file mode 100644
index 0000000..2f625e6
--- /dev/null
+++ b/modules/rtmpose/configs/face_2d_keypoint/topdown_regression/wflw/td-reg_res50_wingloss_8xb64-210e_wflw-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=98,
+ loss=dict(type='WingLoss', use_target_weight=True),
+ decoder=codec),
+ train_cfg=dict(),
+ test_cfg=dict(
+ flip_test=True,
+ shift_coords=True,
+ ))
+
+# base dataset settings
+dataset_type = 'WFLWDataset'
+data_mode = 'topdown'
+data_root = 'data/wflw/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# dataloaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='images/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/face_landmarks_wflw_test.json',
+ data_prefix=dict(img='images/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='NME', rule='less'))
+
+# evaluators
+val_evaluator = dict(
+ type='NME',
+ norm_mode='keypoint_distance',
+)
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/README.md b/modules/rtmpose/configs/fashion_2d_keypoint/README.md
new file mode 100644
index 0000000..f4ec40f
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/README.md
@@ -0,0 +1,7 @@
+# 2D Fashion Landmark Detection
+
+2D fashion landmark detection (also referred to as fashion alignment) aims to detect keypoints located at the functional regions of clothes, for example the neckline and the cuffs.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_fashion_landmark.md) to prepare data.
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..94e47b7
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,42 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the object bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator produces heatmaps that represent the likelihood of each location being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
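+A simplified sketch of the decoding step (argmax only — MMPose's decoders additionally handle flip-test averaging, `shift_heatmap`, and UDP/DARK-style sub-pixel refinement):
+
+```python
+import numpy as np
+
+
+def decode_heatmaps(heatmaps: np.ndarray, input_size=(192, 256)) -> np.ndarray:
+    """Map per-keypoint heatmap argmaxes back to input-crop coordinates.
+
+    heatmaps: (num_joints, H, W), e.g. (8, 64, 48) for a 192x256 input crop.
+    """
+    num_joints, hm_h, hm_w = heatmaps.shape
+    flat_idx = heatmaps.reshape(num_joints, -1).argmax(axis=1)
+    xs, ys = flat_idx % hm_w, flat_idx // hm_w
+    # heatmaps are predicted at 1/4 of the input resolution (48x64 vs 192x256)
+    return np.stack([xs * (input_size[0] / hm_w),
+                     ys * (input_size[1] / hm_h)], axis=1)
+
+
+coords = decode_heatmaps(np.random.rand(8, 64, 48))  # -> (8, 2) landmark coordinates
+```
+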
+## Results and Models
+
+### DeepFashion Dataset
+
+Results on the DeepFashion dataset with HRNet and ResNet backbones:
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-----------------: | :--------: | :-----: | :--: | :--: | :----------------------------------------------------------: |
+| HRNet-w48-UDP-Upper | 256x192 | 96.1 | 60.9 | 15.1 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) |
+| HRNet-w48-UDP-Lower | 256x192 | 97.8 | 76.1 | 8.9 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) |
+| HRNet-w48-UDP-Full | 256x192 | 98.3 | 67.3 | 11.7 | [hrnet_deepfashion.md](./deepfashion/hrnet_deepfashion.md) |
+| ResNet-50-Upper | 256x192 | 95.4 | 57.8 | 16.8 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) |
+| ResNet-50-Lower | 256x192 | 96.5 | 74.4 | 10.5 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) |
+| ResNet-50-Full | 256x192 | 97.7 | 66.4 | 12.7 | [resnet_deepfashion.md](./deepfashion/resnet_deepfashion.md) |
+
+### DeepFashion2 Dataset
+
+Results on the DeepFashion2 dataset with the ResNet-50 backbone:
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-----------------------------: | :--------: | :-----: | :---: | :--: | :-----------------------------------------------------------: |
+| ResNet-50-Short-Sleeved-Shirt | 256x192 | 0.988 | 0.703 | 10.2 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Long-Sleeved-Shirt | 256x192 | 0.973 | 0.587 | 16.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Short-Sleeved-Outwear | 256x192 | 0.966 | 0.408 | 24.0 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Long-Sleeved-Outwear | 256x192 | 0.987 | 0.517 | 18.1 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Vest | 256x192 | 0.981 | 0.643 | 12.7 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Sling | 256x192 | 0.940 | 0.557 | 21.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Shorts | 256x192 | 0.975 | 0.682 | 12.4 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Trousers | 256x192 | 0.973 | 0.625 | 14.8 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Skirt | 256x192 | 0.952 | 0.653 | 16.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Short-Sleeved-Dress | 256x192 | 0.980 | 0.603 | 15.6 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Long-Sleeved-Dress | 256x192 | 0.976 | 0.518 | 20.1 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Vest-Dress | 256x192 | 0.980 | 0.600 | 16.0 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
+| ResNet-50-Sling-Dress | 256x192 | 0.967 | 0.544 | 19.5 | [res50_deepfashion2.md](./deepfashion2/res50_deepfashion2.md) |
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md
new file mode 100644
index 0000000..ece7c38
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.md
@@ -0,0 +1,77 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+DeepFashion (CVPR'2016)
+
+```bibtex
+@inproceedings{liuLQWTcvpr16DeepFashion,
+ author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
+ title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
+ booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2016}
+}
+```
+
+
+
+
+
+
+DeepFashion (ECCV'2016)
+
+```bibtex
+@inproceedings{liuYLWTeccv16FashionLandmark,
+ author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
+ title = {Fashion Landmark Detection in the Wild},
+ booktitle = {European Conference on Computer Vision (ECCV)},
+ month = {October},
+ year = {2016}
+ }
+```
+
+
+
+Results on DeepFashion val set
+
+| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :---- | :-------------------------------------------------------: | :--------: | :-----: | :--: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: |
+| upper | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py) | 256x192 | 96.1 | 60.9 | 15.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.log) |
+| lower | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py) | 256x192 | 97.8 | 76.1 | 8.9 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.log) |
+| full | [pose_hrnet_w48_udp](td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py) | 256x192 | 98.3 | 67.3 | 11.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.pth) | [log](https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.log) |
+
+Note: Due to time constraints, only the models listed above have been trained. We warmly welcome any contributions if you can successfully reproduce the results from the paper!
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml
new file mode 100644
index 0000000..ccca1d3
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/hrnet_deepfashion.yml
@@ -0,0 +1,45 @@
+Models:
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ - UDP
+ Training Data: DeepFashion
+ Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 76.1
+ EPE: 8.9
+ PCK@0.2: 97.8
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192-ddaf747d_20230810.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion
+ Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 60.9
+ EPE: 15.1
+ PCK@0.2: 96.1
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192-de7c0eb1_20230810.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion
+ Name: td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 67.3
+ EPE: 11.7
+ PCK@0.2: 98.3
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192-7ab504c7_20230810.pth
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md
new file mode 100644
index 0000000..4475ebc
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.md
@@ -0,0 +1,77 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+DeepFashion (CVPR'2016)
+
+```bibtex
+@inproceedings{liuLQWTcvpr16DeepFashion,
+ author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou},
+ title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations},
+ booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2016}
+}
+```
+
+
+
+
+
+
+DeepFashion (ECCV'2016)
+
+```bibtex
+@inproceedings{liuYLWTeccv16FashionLandmark,
+ author = {Liu, Ziwei and Yan, Sijie and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
+ title = {Fashion Landmark Detection in the Wild},
+ booktitle = {European Conference on Computer Vision (ECCV)},
+ month = {October},
+ year = {2016}
+ }
+```
+
+
+
+Results on DeepFashion val set
+
+| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :---- | :-------------------------------------------------------: | :--------: | :-----: | :--: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: |
+| upper | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py) | 256x192 | 95.4 | 57.8 | 16.8 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192-41794f03_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192_20210124.log.json) |
+| lower | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py) | 256x192 | 96.5 | 74.4 | 10.5 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192-1292a839_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192_20210124.log.json) |
+| full | [pose_resnet_50](td-hm_res50_8xb64-210e_deepfashion_full-256x192.py) | 256x192 | 97.7 | 66.4 | 12.7 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192-0dbd6e42_20210124.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192_20210124.log.json) |
+
+Note: Due to time constraints, we have only trained ResNet-50 models. We warmly welcome any contributions if you can successfully reproduce the results from the paper!
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml
new file mode 100644
index 0000000..228c49c
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/resnet_deepfashion.yml
@@ -0,0 +1,51 @@
+Collections:
+- Name: SimpleBaseline2D
+ Paper:
+ Title: Simple baselines for human pose estimation and tracking
+ URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html
+ README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/simplebaseline2d.md
+Models:
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: DeepFashion
+ Name: td-hm_res50_8xb64-210e_deepfashion_upper-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 57.8
+ EPE: 16.8
+ PCK@0.2: 95.4
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_upper_256x192-41794f03_20210124.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion
+ Name: td-hm_res50_8xb64-210e_deepfashion_lower-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 74.4
+      EPE: 10.5
+      PCK@0.2: 96.5
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_lower_256x192-1292a839_20210124.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion
+ Name: td-hm_res50_8xb64-210e_deepfashion_full-256x192
+ Results:
+ - Dataset: DeepFashion
+ Metrics:
+ AUC: 66.4
+ EPE: 12.7
+ PCK@0.2: 97.7
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion_full_256x192-0dbd6e42_20210124.pth
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..8044431
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py
@@ -0,0 +1,169 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
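+        # one heatmap per landmark; the DeepFashion 'full' subset annotates 8 landmarks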
+ out_channels=8,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
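+# (top-down paradigm: each sample is a single garment instance, cropped and
+#  warped to the model input size from its bbox center/scale)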
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..a89cbb5
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,169 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
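+        # DeepFashion 'lower' subset annotates 4 clothing landmarks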
+ out_channels=4,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..b6b8cec
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,169 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
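+        # DeepFashion 'upper' subset annotates 6 clothing landmarks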
+ out_channels=6,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..0edccd2
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_full-256x192.py
@@ -0,0 +1,26 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py'
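+
+# UDP (Unbiased Data Processing, CVPR 2020) replaces the MSRA codec with an
+# unbiased coordinate transform; shift_heatmap is disabled below because UDP
+# already aligns the flipped heatmaps during flip test.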
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..e400284
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,26 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py'
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..2003311
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w32_udp_8xb64-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,26 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py'
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..f28a128
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py
@@ -0,0 +1,42 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_full-256x192.py'
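+
+# HRNet-W48 widens every stage to 48/96/192/384 channels (vs. 32/64/128/256
+# for W32); the per-GPU batch is halved to 32, so the LR auto-scaling base
+# drops to 256.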
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(in_channels=48))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..e823833
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,42 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_lower-256x192.py' # noqa
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(in_channels=48))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..a819a55
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,42 @@
+_base_ = './td-hm_hrnet-w32_8xb64-210e_deepfashion_upper-256x192.py' # noqa
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(in_channels=48))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..ad0e2e0
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_full-256x192.py
@@ -0,0 +1,31 @@
+_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_full-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..91c6ec2
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,31 @@
+_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_lower-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..ba707f6
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_hrnet-w48_udp_8xb32-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,31 @@
+_base_ = './td-hm_hrnet-w48_8xb32-210e_deepfashion_upper-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+model = dict(
+ test_cfg=dict(flip_test=True, flip_mode='heatmap', shift_heatmap=False))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True),
+ dict(type='PackPoseInputs')
+]
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..15e5b54
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_full-256x192.py
@@ -0,0 +1,8 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_full-256x192.py'
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet101')))
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..78a3879
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,8 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py'
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet101')))
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..449ef14
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res101_8xb64-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,8 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py'
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet101')))
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..a9a79d8
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_full-256x192.py
@@ -0,0 +1,13 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_full-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet152')))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..9d1cca6
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,13 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet152')))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..68f85cf
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res152_8xb32-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,13 @@
+_base_ = './td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py'
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+model = dict(
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained',
+ checkpoint='torchvision://resnet152')))
+
+train_dataloader = dict(batch_size=32)
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py
new file mode 100644
index 0000000..e883794
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_full-256x192.py
@@ -0,0 +1,140 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=8,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='full',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_full_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py
new file mode 100644
index 0000000..8723ab7
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_lower-256x192.py
@@ -0,0 +1,140 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=64)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=4,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='lower',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_lower_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py
new file mode 100644
index 0000000..9541987
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion/td-hm_res50_8xb64-210e_deepfashion_upper-256x192.py
@@ -0,0 +1,140 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=64)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=6,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashionDataset'
+data_mode = 'topdown'
+data_root = 'data/fld/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+test_pipeline = val_pipeline
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_train.json',
+ data_prefix=dict(img='img/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_val.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ subset='upper',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/fld_upper_test.json',
+ data_prefix=dict(img='img/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md
new file mode 100644
index 0000000..c19eced
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfashion2.md
@@ -0,0 +1,67 @@
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+DeepFashion2 (CVPR'2019)
+
+```bibtex
+@article{DeepFashion2,
+ author = {Yuying Ge and Ruimao Zhang and Lingyun Wu and Xiaogang Wang and Xiaoou Tang and Ping Luo},
+ title={A Versatile Benchmark for Detection, Pose Estimation, Segmentation and Re-Identification of Clothing Images},
+ journal={CVPR},
+ year={2019}
+}
+```
+
+Results on DeepFashion2 val set
+
+| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :-------------------- | :-------------------------------------------------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------: | :-------------------------------------------------: |
+| short_sleeved_shirt | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py) | 256x192 | 0.988 | 0.703 | 10.2 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192-21e1c5da_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192_20221208.log.json) |
+| long_sleeved_shirt | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py) | 256x192 | 0.973 | 0.587 | 16.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192-8679e7e3_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192_20221208.log.json) |
+| short_sleeved_outwear | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py) | 256x192 | 0.966 | 0.408 | 24.0 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192-a04c1298_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192_20221208.log.json) |
+| long_sleeved_outwear | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py) | 256x192 | 0.987 | 0.517 | 18.1 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192-31fbaecf_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192_20221208.log.json) |
+| vest | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py) | 256x192 | 0.981 | 0.643 | 12.7 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192-4c48d05c_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192_20221208.log.json) |
+| sling | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py) | 256x192 | 0.940 | 0.557 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192-ebb2b736_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192_20221208.log.json) |
+| shorts | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py) | 256x192 | 0.975 | 0.682 | 12.4 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192-9ab23592_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192_20221208.log.json) |
+| trousers | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py) | 256x192 | 0.973 | 0.625 | 14.8 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192-3e632257_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192_20221208.log.json) |
+| skirt | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py) | 256x192 | 0.952 | 0.653 | 16.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192-09573469_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192_20221208.log.json) |
+| short_sleeved_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py) | 256x192 | 0.980 | 0.603 | 15.6 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192-1345b07a_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192_20221208.log.json) |
+| long_sleeved_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py) | 256x192 | 0.976 | 0.518 | 20.1 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192-87bac74e_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192_20221208.log.json) |
+| vest_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py) | 256x192 | 0.980 | 0.600 | 16.0 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192-fb3fbd6f_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192_20221208.log.json) |
+| sling_dress | [pose_resnet_50](/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py) | 256x192 | 0.967 | 0.544 | 19.5 | [ckpt](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192-8ebae0eb_20221208.pth) | [log](https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192_20221208.log.json) |
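+
+A minimal inference sketch using the MMPose top-down API (the config path and
+checkpoint URL are taken from the `skirt` row above; the test image path is a
+hypothetical placeholder):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+config = ('configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/'
+          'td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py')
+checkpoint = ('https://download.openmmlab.com/mmpose/fashion/resnet/'
+              'res50_deepfashion2_skirt_256x192-09573469_20221208.pth')
+
+# build the pose estimator from config + checkpoint
+model = init_model(config, checkpoint, device='cpu')
+
+# without explicit bboxes, the whole image is treated as one instance box
+results = inference_topdown(model, 'demo.jpg')  # placeholder image path
+print(results[0].pred_instances.keypoints.shape)  # (1, 294, 2)
+```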
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml
new file mode 100644
index 0000000..61b8a65
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/res50_deepfasion2.yml
@@ -0,0 +1,185 @@
+Models:
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: DeepFashion2
+ Name: td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.703
+ EPE: 10.2
+ PCK@0.2: 0.988
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_shirt_256x192-21e1c5da_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.587
+ EPE: 16.5
+ PCK@0.2: 0.973
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_shirt_256x192-8679e7e3_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.408
+ EPE: 24.0
+ PCK@0.2: 0.966
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_outwear_256x192-a04c1298_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.517
+ EPE: 18.1
+ PCK@0.2: 0.987
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_outwear_256x192-31fbaecf_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_4xb64-210e_deepfasion2-vest-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.643
+ EPE: 12.7
+ PCK@0.2: 0.981
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_256x192-4c48d05c_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_4xb64-210e_deepfasion2-sling-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.557
+ EPE: 21.6
+ PCK@0.2: 0.94
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_256x192-ebb2b736_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.682
+ EPE: 12.4
+ PCK@0.2: 0.975
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_shorts_256x192-9ab23592_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.625
+ EPE: 14.8
+ PCK@0.2: 0.973
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_trousers_256x192-3e632257_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.653
+ EPE: 16.6
+ PCK@0.2: 0.952
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_skirt_256x192-09573469_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.603
+ EPE: 15.6
+ PCK@0.2: 0.98
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_short_sleeved_dress_256x192-1345b07a_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.518
+ EPE: 20.1
+ PCK@0.2: 0.976
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_long_sleeved_dress_256x192-87bac74e_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.6
+ EPE: 16.0
+ PCK@0.2: 0.98
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_vest_dress_256x192-fb3fbd6f_20221208.pth
+- Config: configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: DeepFashion2
+ Name: td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192
+ Results:
+ - Dataset: DeepFashion2
+ Metrics:
+ AUC: 0.544
+ EPE: 19.5
+ PCK@0.2: 0.967
+ Task: Fashion 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/fashion/resnet/res50_deepfashion2_sling_dress_256x192-8ebae0eb_20221208.pth
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py
new file mode 100644
index 0000000..437b9aa
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=64)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
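+        # DeepFashion2 defines a unified set of 294 keypoints across its 13 categories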
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_long_sleeved_dress_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_long_sleeved_dress_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
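Each of these per-category files is a complete, standalone MMPose config, so it can be launched directly through MMEngine's `Runner`. A minimal sketch, assuming MMPose is installed and DeepFashion2 is laid out under `data/deepfasion2/` as the config expects (the `work_dir` value is an arbitrary choice):

```python
# Minimal training launch for the long-sleeved-dress config added above.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/'
    'td-hm_res50_1xb64-210e_deepfasion2-long-sleeved-dress-256x192.py')
cfg.work_dir = 'work_dirs/deepfashion2_long_sleeved_dress'  # Runner requires one

runner = Runner.from_cfg(cfg)
runner.train()  # validates every 10 epochs and keeps the best-AUC checkpoint
```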
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py
new file mode 100644
index 0000000..3b8ec62
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-skirt-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=64)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_skirt_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_skirt_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py
new file mode 100644
index 0000000..1883314
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_1xb64-210e_deepfasion2-vest-dress-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=64)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_vest_dress_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_vest_dress_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py
new file mode 100644
index 0000000..a5f6637
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_2xb64-210e_deepfasion2-trousers-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=128)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_trousers_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_trousers_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
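Aside from the annotation files, the one value that varies across these configs is `auto_scale_lr.base_batch_size`, which encodes the `{N}xb64` of each file name (N GPUs × 64 images per GPU, so 128 for this `2xb64` file). When auto scaling is enabled on the training script, MMEngine rescales the learning rate linearly to the batch size actually used; a sketch of that rule:

```python
# Linear LR scaling as applied when auto_scale_lr is enabled (sketch).
base_lr = 5e-4          # optim_wrapper.optimizer.lr in the config above
base_batch_size = 128   # "2xb64": 2 GPUs x 64 images per GPU
actual_batch_size = 64  # e.g. the same config run on a single GPU

scaled_lr = base_lr * actual_batch_size / base_batch_size
print(scaled_lr)  # 0.00025
```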
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py
new file mode 100644
index 0000000..0a00361
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_3xb64-210e_deepfasion2-shorts-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=192)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_shorts_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_shorts_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py
new file mode 100644
index 0000000..d865565
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-short-sleeved-dress-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_short_sleeved_dress_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_short_sleeved_dress_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py
new file mode 100644
index 0000000..eb42c72
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_sling_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_sling_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py
new file mode 100644
index 0000000..8d206f3
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-sling-dress-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_sling_dress_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_sling_dress_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py
new file mode 100644
index 0000000..c0ed06d
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_4xb64-210e_deepfasion2-vest-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_vest_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_vest_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py
new file mode 100644
index 0000000..e1bbbe2
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_6xb64-210e_deepfasion2-short-sleeved-shirt-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=384)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_short_sleeved_shirt_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_short_sleeved_shirt_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py
new file mode 100644
index 0000000..2b36f62
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-outwear-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_long_sleeved_outwear_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/'
+ 'deepfashion2_long_sleeved_outwear_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py
new file mode 100644
index 0000000..8d25b31
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-long-sleeved-shirt-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_long_sleeved_shirt_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/deepfashion2_long_sleeved_shirt_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py
new file mode 100644
index 0000000..9e381df
--- /dev/null
+++ b/modules/rtmpose/configs/fashion_2d_keypoint/topdown_heatmap/deepfashion2/td-hm_res50_8xb64-210e_deepfasion2-short-sleeved-outwear-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ logger=dict(type='LoggerHook', interval=10),
+ checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=294,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'DeepFashion2Dataset'
+data_mode = 'topdown'
+data_root = 'data/deepfasion2/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='train/deepfashion2_short_sleeved_outwear_train.json',
+ data_prefix=dict(img='train/image/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='validation/'
+ 'deepfashion2_short_sleeved_outwear_validation.json',
+ data_prefix=dict(img='validation/image/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/README.md b/modules/rtmpose/configs/hand_2d_keypoint/README.md
new file mode 100644
index 0000000..29b9066
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/README.md
@@ -0,0 +1,18 @@
+# 2D Hand Pose Estimation
+
+2D hand pose estimation is defined as the task of detecting the poses (or keypoints) of the hand from an input image.
+
+Normally, the input images are cropped hand images in which the hand is centered,
+or the rough location (or bounding box) of the hand is provided.
+
+## Data preparation
+
+Please follow [Data Preparation](/docs/en/dataset_zoo/2d_hand_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/2d_hand_demo.md) to run demos.
+
+
+
+
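A trained hand model can be smoke-tested with the top-down API from `mmpose.apis`; the sketch below assumes the RTMPose-m hand config and released checkpoint that appear further down in this diff, with `hand.jpg` as a placeholder input:

```python
from mmpose.apis import inference_topdown, init_model

pose_estimator = init_model(
    'configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/'
    'rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py',
    'https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
    'rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth',
    device='cpu')

# With no bboxes given, the whole image is treated as a single hand crop;
# pass xyxy boxes from a hand detector for multi-hand images.
results = inference_topdown(pose_estimator, 'hand.jpg')
print(results[0].pred_instances.keypoints.shape)  # (1, 21, 2)
```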
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md
new file mode 100644
index 0000000..2da6481
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/README.md
@@ -0,0 +1,16 @@
+# RTMPose
+
+Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet their application in industry still suffers from heavy model parameters and high latency.
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
+Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
+To further evaluate RTMPose's capability in critical real-time applications, we also report performance after deployment on mobile devices.
+
+## Results and Models
+
+### COCO-WholeBody-Hand Dataset
+
+Results on COCO-WholeBody-Hand val set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-------: | :--------: | :-----: | :---: | :--: | :------------------------------------------------------------------------------------: |
+| RTMPose-m | 256x256 | 0.815 | 0.837 | 4.51 | [rtmpose_coco_wholebody_hand.md](./coco_wholebody_hand/rtmpose_coco_wholebody_hand.md) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..2199e09
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=21,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
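+# NOTE: PipelineSwitchHook swaps in train_pipeline_stage2 (gentler
+# RandomBBoxTransform, CoarseDropout p=0.5 instead of 1.0) for the final
+# stage2_num_epochs=30 epochs; EMAHook maintains an exponential moving
+# average of the weights, which is what is swapped in for validation.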
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
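Unlike the heatmap configs earlier in this diff, the `SimCCLabel` codec above encodes each keypoint coordinate as a 1-D classification over sub-pixel bins, trained with `KLDiscretLoss` against Gaussian-smoothed labels. A sketch of the encoding under these codec settings (illustrative, not the MMPose implementation):

```python
# How SimCC discretizes one keypoint coordinate (illustrative sketch).
input_size = (256, 256)   # codec input_size
simcc_split_ratio = 2.0   # bins per input pixel -> 0.5 px resolution

num_bins_x = int(input_size[0] * simcc_split_ratio)  # 512 classes for x
x = 137.3                                            # keypoint x in input pixels
target_bin = round(x * simcc_split_ratio)            # 275

# Training replaces the one-hot target with a Gaussian over bins
# (sigma=5.66 bins) and minimizes KL divergence against the head's
# per-coordinate softmax, which is what KLDiscretLoss implements.
```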
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md
new file mode 100644
index 0000000..edf0819
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md
@@ -0,0 +1,39 @@
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [rtmpose_m](/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml
new file mode 100644
index 0000000..2fb0378
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture:
+ - RTMPose
+ Training Data: COCO-WholeBody-Hand
+ Name: rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+      AUC: 0.837
+      EPE: 4.51
+      PCK@0.2: 0.815
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py
new file mode 100644
index 0000000..96b839f
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py
@@ -0,0 +1,380 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# Combined training on five hand datasets: COCO-WholeBody-Hand, OneHand10K,
+# FreiHAND (2D), RHD (2D) and Halpe hands.
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 10
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(256, 256),
+ sigma=(5.66, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=21,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ # dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=180),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.2),
+ dict(type='MedianBlur', p=0.2),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_onehand10k = dict(
+ type='OneHand10KDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='onehand10k/annotations/onehand10k_train.json',
+ data_prefix=dict(img='pose/OneHand10K/'),
+ pipeline=[],
+)
+
+dataset_freihand = dict(
+ type='FreiHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='freihand/annotations/freihand_train.json',
+ data_prefix=dict(img='pose/FreiHand/'),
+ pipeline=[],
+)
+
+dataset_rhd = dict(
+ type='Rhd2DDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='rhd/annotations/rhd_train.json',
+ data_prefix=dict(img='pose/RHD/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=21,
+ mapping=[
+ (0, 0),
+ (1, 4),
+ (2, 3),
+ (3, 2),
+ (4, 1),
+ (5, 8),
+ (6, 7),
+ (7, 6),
+ (8, 5),
+ (9, 12),
+ (10, 11),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+ (15, 14),
+ (16, 13),
+ (17, 20),
+ (18, 19),
+ (19, 18),
+ (20, 17),
+ ])
+ ],
+)
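+# The KeypointConverter above remaps RHD's 21-keypoint order to the
+# COCO-WholeBody-Hand layout shared by the combined dataset; within each
+# finger the joint order is reversed (e.g. RHD indices 1-4 map to 4-1).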
+
+dataset_halpehand = dict(
+ type='HalpeHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015/'),
+ pipeline=[],
+)
+
+# data loaders
+train_dataloader = dict(
+ batch_size=256,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(
+ from_file='configs/_base_/datasets/coco_wholebody_hand.py'),
+ datasets=[
+ dataset_coco, dataset_onehand10k, dataset_freihand, dataset_rhd,
+ dataset_halpehand
+ ],
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+# test datasets
+val_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+val_onehand10k = dict(
+ type='OneHand10KDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='onehand10k/annotations/onehand10k_test.json',
+ data_prefix=dict(img='pose/OneHand10K/'),
+ pipeline=[],
+)
+
+val_freihand = dict(
+ type='FreiHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='freihand/annotations/freihand_test.json',
+ data_prefix=dict(img='pose/FreiHand/'),
+ pipeline=[],
+)
+
+val_rhd = dict(
+ type='Rhd2DDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='rhd/annotations/rhd_test.json',
+ data_prefix=dict(img='pose/RHD/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=21,
+ mapping=[
+ (0, 0),
+ (1, 4),
+ (2, 3),
+ (3, 2),
+ (4, 1),
+ (5, 8),
+ (6, 7),
+ (7, 6),
+ (8, 5),
+ (9, 12),
+ (10, 11),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+ (15, 14),
+ (16, 13),
+ (17, 20),
+ (18, 19),
+ (19, 18),
+ (20, 17),
+ ])
+ ],
+)
+
+val_halpehand = dict(
+ type='HalpeHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_val_v1.json',
+ data_prefix=dict(img='detection/coco/val2017/'),
+ pipeline=[],
+)
+
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(
+ from_file='configs/_base_/datasets/coco_wholebody_hand.py'),
+ datasets=[
+ val_coco, val_onehand10k, val_freihand, val_rhd, val_halpehand
+ ],
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+val_dataloader = test_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
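+# EMAHook evaluates with an exponential moving average of the weights, while
+# PipelineSwitchHook swaps in the milder train_pipeline_stage2 augmentations
+# for the last stage2_num_epochs (10) epochs of training.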
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md
new file mode 100644
index 0000000..5da110e
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.md
@@ -0,0 +1,67 @@
+
+
+
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+```
+
+
+
+
+
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+COCO (ECCV'2014)
+
+```bibtex
+@inproceedings{lin2014microsoft,
+ title={Microsoft coco: Common objects in context},
+ author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+ booktitle={European conference on computer vision},
+ pages={740--755},
+ year={2014},
+ organization={Springer}
+}
+```
+
+
+
+- `Hand5` and `*` denote the model trained on 5 public datasets:
+ - [COCO-Wholebody-Hand](https://github.com/jin-s13/COCO-WholeBody/)
+ - [OneHand10K](https://www.yangangwang.com/papers/WANG-MCC-2018-10.html)
+ - [FreiHand2d](https://lmb.informatik.uni-freiburg.de/projects/freihand/)
+ - [RHD2d](https://lmb.informatik.uni-freiburg.de/resources/datasets/RenderedHandposeDataset.en.html)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+
+| Config | Input Size | PCK@0.2<br>(COCO-Wholebody-Hand) | PCK@0.2<br>(Hand5) | AUC<br>(Hand5) | EPE<br>(Hand5) | FLOPS(G) | Download |
+| :---------------------------------------: | :--------: | :-----------------------------------: | :---------------------: | :-----------------: | :-----------------: | :------: | :-----------------------------------------: |
+| [RTMPose-m\*<br>(alpha version)](./rtmpose/hand_2d_keypoint/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 81.5 | 96.4 | 83.9 | 5.06 | 2.581 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth) |
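+
+A minimal inference sketch with the Hand5 checkpoint above, assuming an `mmpose` 1.x installation and externally supplied hand bounding boxes; the local config/checkpoint paths and the image file are placeholders:
+
+```python
+import numpy as np
+
+from mmpose.apis import inference_topdown, init_model
+
+# Placeholder paths: point these at the Hand5 config and checkpoint above.
+config = 'rtmpose-m_8xb256-210e_hand5-256x256.py'
+checkpoint = 'rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+# Each bbox is (x1, y1, x2, y2) around a hand, e.g. from a hand detector.
+results = inference_topdown(model, 'hand.jpg', bboxes=np.array([[0, 0, 256, 256]]))
+print(results[0].pred_instances.keypoints.shape)  # (1, 21, 2)
+```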
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml
new file mode 100644
index 0000000..a57b36c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/rtmpose/hand5/rtmpose_hand5.yml
@@ -0,0 +1,28 @@
+Collections:
+- Name: RTMPose
+ Paper:
+ Title: "RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose"
+ URL: https://arxiv.org/abs/2303.07399
+ README: https://github.com/open-mmlab/mmpose/blob/main/projects/rtmpose/README.md
+Models:
+- Config: configs/hand_2d_keypoint/rtmpose/hand5/rtmpose-m_8xb256-210e_hand5-256x256.py
+ In Collection: RTMPose
+ Alias: hand
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: &id002
+ - COCO-Wholebody-Hand
+ - OneHand10K
+ - FreiHand2d
+ - RHD2d
+ - Halpe
+ Name: rtmpose-m_8xb256-210e_hand5-256x256
+ Results:
+ - Dataset: Hand5
+ Metrics:
+ PCK@0.2: 0.964
+ AUC: 0.839
+ EPE: 5.06
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..969482a
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,55 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the detected bounding boxes. Instead of estimating keypoint coordinates directly, the pose estimator produces heatmaps that represent the likelihood of each location being a keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
+

+
+
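+Concretely, decoding a predicted heatmap back to coordinates amounts to an argmax plus a rescale from heatmap to input resolution. A simplified sketch follows (MMPose's actual decoder additionally applies sub-pixel shifting and flip-test averaging):
+
+```python
+import numpy as np
+
+def decode_heatmaps(heatmaps: np.ndarray, input_size=(256, 256)):
+    """Argmax-decode (K, H, W) heatmaps to keypoints in input-image pixels."""
+    K, H, W = heatmaps.shape
+    flat = heatmaps.reshape(K, -1)
+    ys, xs = np.divmod(flat.argmax(axis=1), W)
+    scores = flat.max(axis=1)
+    # Rescale from heatmap resolution (e.g. 64x64) to the network input size.
+    kpts = np.stack([xs * input_size[0] / W, ys * input_size[1] / H], axis=1)
+    return kpts, scores
+
+kpts, scores = decode_heatmaps(np.random.rand(21, 64, 64))
+print(kpts.shape, scores.shape)  # (21, 2) (21,)
+```
+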
+## Results and Models
+
+### COCO-WholeBody-Hand Dataset
+
+Results on COCO-WholeBody-Hand val set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :--------------: | :--------: | :-----: | :---: | :--: | :----------------------------------------------------------------------------------------------: |
+| HRNetv2-w18+Dark | 256x256 | 0.814 | 0.840 | 4.37 | [hrnetv2_dark_coco_wholebody_hand.md](./coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md) |
+| HRNetv2-w18 | 256x256 | 0.813 | 0.840 | 4.39 | [hrnetv2_coco_wholebody_hand.md](./coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md) |
+| HourglassNet | 256x256 | 0.804 | 0.835 | 4.54 | [hourglass_coco_wholebody_hand.md](./coco_wholebody_hand/hourglass_coco_wholebody_hand.md) |
+| SCNet-50 | 256x256 | 0.803 | 0.834 | 4.55 | [scnet_coco_wholebody_hand.md](./coco_wholebody_hand/scnet_coco_wholebody_hand.md) |
+| ResNet-50 | 256x256 | 0.800 | 0.833 | 4.64 | [resnet_coco_wholebody_hand.md](./coco_wholebody_hand/resnet_coco_wholebody_hand.md) |
+| LiteHRNet-18 | 256x256 | 0.795 | 0.830 | 4.77 | [litehrnet_coco_wholebody_hand.md](./coco_wholebody_hand/litehrnet_coco_wholebody_hand.md) |
+| MobileNet-v2 | 256x256 | 0.795 | 0.829 | 4.77 | [mobilenetv2_coco_wholebody_hand.md](./coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md) |
+
+### FreiHand Dataset
+
+Results on FreiHand val & test set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------------: |
+| ResNet-50 | 224x224 | 0.999 | 0.868 | 3.27 | [resnet_freihand2d.md](./freihand2d/resnet_freihand2d.md) |
+
+### OneHand10K Dataset
+
+Results on OneHand10K val set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :--------------: | :--------: | :-----: | :---: | :---: | :-------------------------------------------------------------------: |
+| HRNetv2-w18+Dark | 256x256 | 0.990 | 0.572 | 23.96 | [hrnetv2_dark_onehand10k.md](./onehand10k/hrnetv2_dark_onehand10k.md) |
+| HRNetv2-w18+UDP | 256x256 | 0.990 | 0.571 | 23.88 | [hrnetv2_udp_onehand10k.md](./onehand10k/hrnetv2_udp_onehand10k.md) |
+| HRNetv2-w18 | 256x256 | 0.990 | 0.567 | 24.26 | [hrnetv2_onehand10k.md](./onehand10k/hrnetv2_onehand10k.md) |
+| ResNet-50 | 256x256 | 0.989 | 0.555 | 25.16 | [resnet_onehand10k.md](./onehand10k/resnet_onehand10k.md) |
+| MobileNet-v2 | 256x256 | 0.986 | 0.537 | 28.56 | [mobilenetv2_onehand10k.md](./onehand10k/mobilenetv2_onehand10k.md) |
+
+### RHD Dataset
+
+Results on RHD test set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :--------------: | :--------: | :-----: | :---: | :--: | :----------------------------------------------------: |
+| HRNetv2-w18+Dark | 256x256 | 0.992 | 0.903 | 2.18 | [hrnetv2_dark_rhd2d.md](./rhd2d/hrnetv2_dark_rhd2d.md) |
+| HRNetv2-w18+UDP | 256x256 | 0.992 | 0.902 | 2.19 | [hrnetv2_udp_rhd2d.md](./rhd2d/hrnetv2_udp_rhd2d.md) |
+| HRNetv2-w18 | 256x256 | 0.992 | 0.902 | 2.21 | [hrnetv2_rhd2d.md](./rhd2d/hrnetv2_rhd2d.md) |
+| ResNet-50 | 256x256 | 0.991 | 0.898 | 2.32 | [resnet_rhd2d.md](./rhd2d/resnet_rhd2d.md) |
+| MobileNet-v2 | 256x256 | 0.985 | 0.883 | 2.79 | [mobilenetv2_rhd2d.md](./rhd2d/mobilenetv2_rhd2d.md) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md
new file mode 100644
index 0000000..2926593
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.md
@@ -0,0 +1,39 @@
+
+
+
+Hourglass (ECCV'2016)
+
+```bibtex
+@inproceedings{newell2016stacked,
+ title={Stacked hourglass networks for human pose estimation},
+ author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
+ booktitle={European conference on computer vision},
+ pages={483--499},
+ year={2016},
+ organization={Springer}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hourglass_52](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.804 | 0.835 | 4.54 | [ckpt](https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256-7b05c6db_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml
new file mode 100644
index 0000000..21ff3f0
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: Hourglass
+ Metadata:
+ Architecture:
+ - Hourglass
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.835
+ EPE: 4.54
+ PCK@0.2: 0.804
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/hourglass/hourglass52_coco_wholebody_hand_256x256-7b05c6db_20210909.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md
new file mode 100644
index 0000000..eae4dce
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.md
@@ -0,0 +1,39 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.813 | 0.840 | 4.39 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256_20210908.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml
new file mode 100644
index 0000000..0190ac9
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.84
+ EPE: 4.39
+ PCK@0.2: 0.813
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_coco_wholebody_hand_256x256-1c028db7_20210908.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md
new file mode 100644
index 0000000..718baa7
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.md
@@ -0,0 +1,56 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.814 | 0.840 | 4.37 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark-a9228c9c_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark_20210908.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml
new file mode 100644
index 0000000..f5b275a
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.84
+ EPE: 4.37
+ PCK@0.2: 0.814
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_coco_wholebody_hand_256x256_dark-a9228c9c_20210908.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md
new file mode 100644
index 0000000..1508d86
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.md
@@ -0,0 +1,37 @@
+
+
+
+LiteHRNet (CVPR'2021)
+
+```bibtex
+@inproceedings{Yulitehrnet21,
+ title={Lite-HRNet: A Lightweight High-Resolution Network},
+ author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
+ booktitle={CVPR},
+ year={2021}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [LiteHRNet-18](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.795 | 0.830 | 4.77 | [ckpt](https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256-d6945e6a_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256_20210908.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml
new file mode 100644
index 0000000..66c5713
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: LiteHRNet
+ Metadata:
+ Architecture:
+ - LiteHRNet
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.83
+ EPE: 4.77
+ PCK@0.2: 0.795
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/litehrnet/litehrnet_w18_coco_wholebody_hand_256x256-d6945e6a_20210908.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md
new file mode 100644
index 0000000..6b65bd0
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.md
@@ -0,0 +1,38 @@
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_mobilenetv2](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.795 | 0.829 | 4.77 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256-06b8c877_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml
new file mode 100644
index 0000000..cc8c5a2
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.829
+ EPE: 4.77
+ PCK@0.2: 0.795
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_coco_wholebody_hand_256x256-06b8c877_20210909.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md
new file mode 100644
index 0000000..21693f1
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.md
@@ -0,0 +1,55 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.800 | 0.833 | 4.64 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256-8dbc750c_20210908.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256_20210908.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml
new file mode 100644
index 0000000..b663c5d
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.833
+ EPE: 4.64
+ PCK@0.2: 0.8
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_coco_wholebody_hand_256x256-8dbc750c_20210908.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md
new file mode 100644
index 0000000..1cf44e2
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.md
@@ -0,0 +1,38 @@
+
+
+
+SCNet (CVPR'2020)
+
+```bibtex
+@inproceedings{liu2020improving,
+ title={Improving Convolutional Networks with Self-Calibrated Convolutions},
+ author={Liu, Jiang-Jiang and Hou, Qibin and Cheng, Ming-Ming and Wang, Changhu and Feng, Jiashi},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={10096--10105},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody-Hand (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on COCO-WholeBody-Hand val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_scnet_50](/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.803 | 0.834 | 4.55 | [ckpt](https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256-e73414c7_20210909.pth) | [log](https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256_20210909.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml
new file mode 100644
index 0000000..0fd05eb
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SCNet
+ Training Data: COCO-WholeBody-Hand
+ Name: td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256
+ Results:
+ - Dataset: COCO-WholeBody-Hand
+ Metrics:
+ AUC: 0.834
+ EPE: 4.55
+ PCK@0.2: 0.803
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/scnet/scnet50_coco_wholebody_hand_256x256-e73414c7_20210909.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..05b1ad1
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
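+# MSRAHeatmap renders one 64x64 Gaussian (sigma=2) per keypoint as the
+# regression target; 64x64 is 1/4 of the 256x256 network input.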
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HourglassNet',
+ num_stacks=1,
+ ),
+ head=dict(
+ type='CPMHead',
+ in_channels=256,
+ out_channels=21,
+ num_stages=1,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=180.0,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..be8d278
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
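+ # FeatureMapProcessor upsamples and concatenates all four HRNetv2 branch
+ # outputs, so the head below receives 18 + 36 + 72 + 144 = 270 channels.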
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..1c0f1c3
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
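+# unbiased=True enables DarkPose's distribution-aware coordinate encoding and
+# decoding; it is the key difference from the plain HRNetv2-w18 config.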
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..c160687
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,136 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='LiteHRNet',
+ in_channels=3,
+ extra=dict(
+ stem=dict(stem_channels=32, out_channels=32, expand_ratio=1),
+ num_stages=3,
+ stages_spec=dict(
+ num_modules=(2, 4, 2),
+ num_branches=(2, 3, 4),
+ num_blocks=(2, 2, 2),
+ module_type=('LITE', 'LITE', 'LITE'),
+ with_fuse=(True, True, True),
+ reduce_ratios=(8, 8, 8),
+ num_channels=(
+ (40, 80),
+ (40, 80, 160),
+ (40, 80, 160, 320),
+ )),
+ with_head=True,
+ )),
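+ # The backbone output fed to the pose head is the highest-resolution
+ # branch (40 channels), hence in_channels=40 below.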
+ head=dict(
+ type='HeatmapHead',
+ in_channels=40,
+ out_channels=21,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..e68449f
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,120 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2')),
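+ # out_indices=(7, ) selects MobileNetV2's final 1280-channel feature map,
+ # matching in_channels=1280 in the head below.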
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..e7b9ff6
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,119 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py
new file mode 100644
index 0000000..f65ea47
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='SCNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/scnet50-7ef0a199.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyHandDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE')
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md
new file mode 100644
index 0000000..33a57aa
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.md
@@ -0,0 +1,56 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+FreiHand (ICCV'2019)
+
+```bibtex
+@inproceedings{zimmermann2019freihand,
+ title={Freihand: A dataset for markerless capture of hand pose and shape from single rgb images},
+ author={Zimmermann, Christian and Ceylan, Duygu and Yang, Jimei and Russell, Bryan and Argus, Max and Brox, Thomas},
+ booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+ pages={813--822},
+ year={2019}
+}
+```
+
+
+
+Results on FreiHand val & test set
+
+| Set | Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--- | :-------------------------------------------------------: | :--------: | :-----: | :---: | :--: | :-------------------------------------------------------: | :------------------------------------------------------: |
+| test | [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py) | 224x224 | 0.999 | 0.868 | 3.27 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224-ff0799bc_20200914.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224_20200914.log.json) |
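The ckpt URL in this table pairs with the config added just below, so the model can be exercised with the standard mmpose inference helpers. A minimal sketch (the image path is illustrative; passing no bboxes makes `inference_topdown` treat the whole image as the hand crop):

```python
# Sketch: single-image inference with the FreiHand ResNet-50 model above.
from mmpose.apis import inference_topdown, init_model

config = ('modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/'
          'freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py')
checkpoint = ('https://download.openmmlab.com/mmpose/hand/resnet/'
              'res50_freihand_224x224-ff0799bc_20200914.pth')

model = init_model(config, checkpoint, device='cpu')
results = inference_topdown(model, 'hand.jpg')     # illustrative image path
print(results[0].pred_instances.keypoints.shape)   # (1, 21, 2)
```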
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml
new file mode 100644
index 0000000..925f440
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/resnet_freihand2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: FreiHand
+ Name: td-hm_res50_8xb64-100e_freihand2d-224x224
+ Results:
+ - Dataset: FreiHand
+ Metrics:
+ AUC: 0.868
+ EPE: 3.27
+ PCK@0.2: 0.999
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_freihand_224x224-ff0799bc_20200914.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py
new file mode 100644
index 0000000..677ca31
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py
@@ -0,0 +1,138 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=100, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=100,
+ milestones=[50, 70],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='AUC', rule='greater', interval=1))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(224, 224), heatmap_size=(56, 56), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'FreiHandDataset'
+data_mode = 'topdown'
+data_root = 'data/freihand/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.25,
+ rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale', padding=0.8),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/freihand_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/freihand_val.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/freihand_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
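FreiHand is the one dataset here with distinct val and test annotation files, hence the explicit `test_dataloader`. The `MSRAHeatmap` codec lines are also worth unpacking: each keypoint is encoded as a 2-D Gaussian on a 4x-downscaled grid (224 -> 56, sigma 2) and decoded back by an argmax. A toy version of both directions:

```python
# Toy MSRAHeatmap-style encode/decode; the real codec adds target weights,
# border handling and (optionally) unbiased decoding.
import numpy as np

def encode(kpt_xy, heatmap_size=56, stride=4.0, sigma=2.0):
    ys, xs = np.mgrid[0:heatmap_size, 0:heatmap_size]
    cx, cy = kpt_xy[0] / stride, kpt_xy[1] / stride
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))

def decode(heatmap, stride=4.0):
    y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
    return np.array([x, y]) * stride

hm = encode(np.array([100.0, 60.0]))
print(decode(hm))  # [100. 60.] -- exact here because the point sits on the stride-4 grid
```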
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md
new file mode 100644
index 0000000..88fb8e4
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.md
@@ -0,0 +1,60 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.572 | 23.96 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml
new file mode 100644
index 0000000..d02795c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: OneHand10K
+ Name: td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.572
+ EPE: 23.96
+ PCK@0.2: 0.99
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_onehand10k_256x256_dark-a2f80c64_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md
new file mode 100644
index 0000000..41bed70
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.md
@@ -0,0 +1,43 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.567 | 24.26 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256-30bc9c6b_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml
new file mode 100644
index 0000000..f5ee14c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: OneHand10K
+ Name: td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.567
+ EPE: 24.26
+ PCK@0.2: 0.99
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_onehand10k_256x256-30bc9c6b_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md
new file mode 100644
index 0000000..0507035
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.md
@@ -0,0 +1,60 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [pose_hrnetv2_w18_udp](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.571 | 23.88 | [ckpt](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp-0d1b515d_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml
new file mode 100644
index 0000000..903f047
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py
+ In Collection: UDP
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - UDP
+ Training Data: OneHand10K
+ Name: td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.571
+ EPE: 23.88
+ PCK@0.2: 0.99
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_onehand10k_256x256_udp-0d1b515d_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md
new file mode 100644
index 0000000..b89b1d1
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.md
@@ -0,0 +1,42 @@
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [pose_mobilenet_v2](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.986 | 0.537 | 28.56 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256-f3a3d90e_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml
new file mode 100644
index 0000000..4090189
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: OneHand10K
+ Name: td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.537
+ EPE: 28.56
+ PCK@0.2: 0.986
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_onehand10k_256x256-f3a3d90e_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md
new file mode 100644
index 0000000..2498536
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.md
@@ -0,0 +1,59 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [pose_resnet_50](/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py) | 256x256 | 0.989 | 0.555 | 25.16 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256-739c8639_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml
new file mode 100644
index 0000000..f30171d
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: OneHand10K
+ Name: td-hm_res50_8xb32-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.555
+ EPE: 25.16
+ PCK@0.2: 0.989
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256-739c8639_20210330.pth
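These `.yml` companions to the `.md` files are model-index metadata, so checkpoint selection can be scripted instead of read off the tables. A small PyYAML sketch (file path illustrative):

```python
# Sketch: reading a model-index YAML to list checkpoints and their metrics.
import yaml

with open('modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/'
          'onehand10k/resnet_onehand10k.yml') as f:
    index = yaml.safe_load(f)

for m in index['Models']:
    metrics = m['Results'][0]['Metrics']
    print(m['Name'], metrics['AUC'], metrics['EPE'], m['Weights'])
```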
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..499a11a
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
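A non-obvious constant in this and the following HRNetv2 configs is the head's `in_channels=270`: the `FeatureMapProcessor` neck upsamples and concatenates all four stage-4 branches, so the head input width is simply the sum of their channel counts:

```python
# Where in_channels=270 comes from: the neck concatenates the four
# HRNet branches whose widths are declared in stage4 num_channels.
stage4_channels = (18, 36, 72, 144)
assert sum(stage4_channels) == 270
```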
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..08b6588
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
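The only change in this dark variant relative to the plain HRNetv2 config is `unbiased=True` on the codec, which enables DarkPose's distribution-aware decoding: the integer argmax is refined with a Taylor/Newton step on the log-heatmap. A one-dimensional sketch of the idea (the real decoder works in 2-D and Gaussian-smooths the heatmap first):

```python
# 1-D sketch of DarkPose-style sub-pixel refinement.
import numpy as np

def refine_peak(logp, m):
    """Newton step on log-probabilities around the integer argmax m."""
    d1 = (logp[m + 1] - logp[m - 1]) / 2.0          # first derivative
    d2 = logp[m + 1] - 2 * logp[m] + logp[m - 1]    # second derivative
    return m - d1 / d2 if d2 != 0 else float(m)

xs = np.arange(64, dtype=float)
heat = np.exp(-(xs - 30.4) ** 2 / (2 * 2.0 ** 2))   # true peak at 30.4
m = int(np.argmax(heat))                            # integer peak: 30
print(refine_peak(np.log(heat + 1e-12), m))         # ~30.4
```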
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..0dd9402
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
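The udp variant swaps the codec to `UDPHeatmap` and sets `shift_heatmap=False`. UDP's central fix is how continuous image coordinates map onto the discrete heatmap grid: using `(size - 1)` ratios removes the systematic sub-pixel offset that plain stride division introduces. A sketch of the two mappings:

```python
# Sketch of the coordinate mapping UDP corrects (values illustrative).
img_size, hm_size = 256, 64

def biased(x):      # plain stride division, as in the MSRA-style codec
    return x / (img_size / hm_size)

def unbiased(x):    # UDP: treat pixels as samples on a [0, size-1] grid
    return x * (hm_size - 1) / (img_size - 1)

x = 255.0           # right-most image pixel
print(biased(x))    # 63.75 -> lands off the last heatmap pixel centre
print(unbiased(x))  # 63.0  -> exactly the last heatmap pixel
```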
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..63f6af3
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='mmcls://mobilenet_v2',
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
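Here `out_indices=(7,)` selects MobileNetV2's final 1280-channel expansion layer, matching the head's `in_channels=1280`. A quick sanity check against torchvision's MobileNetV2 (assumption: it shares the same final feature width as mmpose's backbone):

```python
# Sanity check of the 1280-channel final feature map on a 256x256 crop.
import torch
from torchvision.models import mobilenet_v2

feats = mobilenet_v2().features(torch.zeros(1, 3, 256, 256))
print(feats.shape)  # torch.Size([1, 1280, 8, 8])
```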
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..11b549c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md
new file mode 100644
index 0000000..2fc7d85
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.md
@@ -0,0 +1,58 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hrnetv2_w18_dark](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.903 | 2.18 | [ckpt](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark-4df3a347_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml
new file mode 100644
index 0000000..9dde35d
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - DarkPose
+ Training Data: RHD
+ Name: td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.903
+ EPE: 2.18
+ PCK@0.2: 0.992
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/dark/hrnetv2_w18_rhd2d_256x256_dark-4df3a347_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md
new file mode 100644
index 0000000..1703e8c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.md
@@ -0,0 +1,41 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hrnetv2_w18](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.902 | 2.21 | [ckpt](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml
new file mode 100644
index 0000000..8415f3c
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml
@@ -0,0 +1,16 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py
+ In Collection: HRNetv2
+ Metadata:
+ Architecture:
+ - HRNetv2
+ Training Data: RHD
+ Name: td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.902
+ EPE: 2.21
+ PCK@0.2: 0.992
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/hrnetv2/hrnetv2_w18_rhd2d_256x256-95b20dd8_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md
new file mode 100644
index 0000000..da766c4
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.md
@@ -0,0 +1,58 @@
+
+
+
+HRNetv2 (TPAMI'2019)
+
+```bibtex
+@article{WangSCJDZLMTWLX19,
+ title={Deep High-Resolution Representation Learning for Visual Recognition},
+ author={Jingdong Wang and Ke Sun and Tianheng Cheng and
+ Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and
+ Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao},
+ journal={TPAMI},
+ year={2019}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch                                                        | Input Size | PCK@0.2 | AUC   | EPE  | ckpt                                                        | log                                                        |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_hrnetv2_w18_udp](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.992 | 0.902 | 2.19 | [ckpt](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp-63ba6007_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml
new file mode 100644
index 0000000..148da23
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py
+ In Collection: UDP
+ Metadata:
+ Architecture:
+ - HRNetv2
+ - UDP
+ Training Data: RHD
+ Name: td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.902
+ EPE: 2.19
+    PCK@0.2: 0.992
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/udp/hrnetv2_w18_rhd2d_256x256_udp-63ba6007_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md
new file mode 100644
index 0000000..19506c5
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.md
@@ -0,0 +1,40 @@
+
+
+
+MobilenetV2 (CVPR'2018)
+
+```bibtex
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_mobilenet_v2](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.985 | 0.883 | 2.79 | [ckpt](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256-85fa02db_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml
new file mode 100644
index 0000000..0d1bd76
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - MobilenetV2
+ Training Data: RHD
+ Name: td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.883
+ EPE: 2.79
+ PCK@0.2: 0.985
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/mobilenetv2/mobilenetv2_rhd2d_256x256-85fa02db_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md
new file mode 100644
index 0000000..843bd75
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.md
@@ -0,0 +1,57 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [pose_resnet50](/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.991 | 0.898 | 2.32 | [ckpt](https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256-5dc7e4cc_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml
new file mode 100644
index 0000000..30cf36b
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/resnet_rhd2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture:
+ - SimpleBaseline2D
+ - ResNet
+ Training Data: RHD
+ Name: td-hm_res50_8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.898
+ EPE: 2.32
+ PCK@0.2: 0.991
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_rhd2d_256x256-5dc7e4cc_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..e9cb89b
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..eac5595
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,162 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(256, 256),
+ heatmap_size=(64, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..c2a672b
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,158 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144),
+ multiscale_output=True),
+ upsample=dict(mode='bilinear', align_corners=False)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://msra/hrnetv2_w18',
+ )),
+ neck=dict(
+ type='FeatureMapProcessor',
+ concat=True,
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=270,
+ out_channels=21,
+ deconv_out_channels=None,
+ conv_out_channels=(270, ),
+ conv_kernel_sizes=(1, ),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..68ee857
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,125 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='MobileNetV2',
+ widen_factor=1.,
+ out_indices=(7, ),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='mmcls://mobilenet_v2',
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1280,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..07d04dc
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,124 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='torchvision://resnet50',
+ )),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=21,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md
new file mode 100644
index 0000000..2fe838b
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/README.md
@@ -0,0 +1,25 @@
+# Top-down regression-based pose estimation
+
+Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the object bounding boxes. In the second stage, regression-based methods directly regress the keypoint coordinates from the features extracted from the bounding-box area, following the paradigm introduced in [Deeppose: Human pose estimation via deep neural networks](http://openaccess.thecvf.com/content_cvpr_2014/html/Toshev_DeepPose_Human_Pose_2014_CVPR_paper.html), as sketched below.
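+
+A minimal sketch of this two-stage pipeline with the MMDetection/MMPose APIs. All paths are illustrative placeholders; any detector and any regression-based pose config/checkpoint from the tables below can be substituted.
+
+```python
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_topdown, init_model
+
+# Placeholder configs/checkpoints -- substitute real ones.
+det_cfg, det_ckpt = 'path/to/det_config.py', 'path/to/det_ckpt.pth'
+pose_cfg, pose_ckpt = 'path/to/pose_config.py', 'path/to/pose_ckpt.pth'
+
+detector = init_detector(det_cfg, det_ckpt, device='cpu')
+pose_model = init_model(pose_cfg, pose_ckpt, device='cpu')
+
+# Stage 1: detect instances and keep confident boxes (xyxy format).
+inst = inference_detector(detector, 'image.png').pred_instances
+bboxes = inst.bboxes[inst.scores > 0.5].cpu().numpy()
+
+# Stage 2: regress keypoint coordinates from each bounding-box region.
+pose_results = inference_topdown(pose_model, 'image.png', bboxes)
+```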
+
+
+
+
+
+## Results and Models
+
+### OneHand10K Dataset
+
+Results on OneHand10K val set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-------: | :--------: | :-----: | :---: | :---: | :-------------------------------------------------------: |
+| ResNet-50 | 256x256 | 0.990 | 0.485 | 34.21 | [resnet_onehand10k.md](./onehand10k/resnet_onehand10k.md) |
+
+### RHD Dataset
+
+Results on RHD test set
+
+| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download |
+| :-------: | :--------: | :-----: | :---: | :--: | :----------------------------------------: |
+| ResNet-50 | 256x256 | 0.988 | 0.865 | 3.32 | [resnet_rhd2d.md](./rhd2d/resnet_rhd2d.md) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md
new file mode 100644
index 0000000..9e9e603
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.md
@@ -0,0 +1,59 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+OneHand10K (TCSVT'2019)
+
+```bibtex
+@article{wang2018mask,
+ title={Mask-pose cascaded cnn for 2d hand pose estimation from single color image},
+ author={Wang, Yangang and Peng, Cong and Liu, Yebin},
+ journal={IEEE Transactions on Circuits and Systems for Video Technology},
+ volume={29},
+ number={11},
+ pages={3258--3268},
+ year={2018},
+ publisher={IEEE}
+}
+```
+
+
+
+Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :---: | :--------------------------------------------------------: | :-------------------------------------------------------: |
+| [deeppose_resnet_50](/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py) | 256x256 | 0.990 | 0.485 | 34.21 | [ckpt](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256-cbddf43a_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml
new file mode 100644
index 0000000..6b92014
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/resnet_onehand10k.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture:
+ - DeepPose
+ - ResNet
+ Training Data: OneHand10K
+ Name: td-reg_res50_8xb64-210e_onehand10k-256x256
+ Results:
+ - Dataset: OneHand10K
+ Metrics:
+ AUC: 0.485
+ EPE: 34.21
+ PCK@0.2: 0.99
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_onehand10k_256x256-cbddf43a_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py
new file mode 100644
index 0000000..4d33893
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=21,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'OneHand10KDataset'
+data_mode = 'topdown'
+data_root = 'data/onehand10k/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/onehand10k_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md
new file mode 100644
index 0000000..25ae62b
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.md
@@ -0,0 +1,57 @@
+
+
+
+DeepPose (CVPR'2014)
+
+```bibtex
+@inproceedings{toshev2014deeppose,
+ title={Deeppose: Human pose estimation via deep neural networks},
+ author={Toshev, Alexander and Szegedy, Christian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={1653--1660},
+ year={2014}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+RHD (ICCV'2017)
+
+```bibtex
+@TechReport{zb2017hand,
+ author={Christian Zimmermann and Thomas Brox},
+ title={Learning to Estimate 3D Hand Pose from Single RGB Images},
+ institution={arXiv:1705.01389},
+ year={2017},
+ note="https://arxiv.org/abs/1705.01389",
+ url="https://lmb.informatik.uni-freiburg.de/projects/hand3d/"
+}
+```
+
+
+
+Results on RHD test set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: |
+| [deeppose_resnet_50](/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py) | 256x256 | 0.988 | 0.865 | 3.32 | [ckpt](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256-37f1c4d3_20210330.pth) | [log](https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256_20210330.log.json) |
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml
new file mode 100644
index 0000000..705329d
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/resnet_rhd2d.yml
@@ -0,0 +1,17 @@
+Models:
+- Config: configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py
+ In Collection: DeepPose
+ Metadata:
+ Architecture:
+ - DeepPose
+ - ResNet
+ Training Data: RHD
+ Name: td-reg_res50_8xb64-210e_rhd2d-256x256
+ Results:
+ - Dataset: RHD
+ Metrics:
+ AUC: 0.865
+ EPE: 3.32
+ PCK@0.2: 0.988
+ Task: Hand 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand/deeppose/deeppose_res50_rhd2d_256x256-37f1c4d3_20210330.pth
diff --git a/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py
new file mode 100644
index 0000000..7591892
--- /dev/null
+++ b/modules/rtmpose/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater'))
+
+# codec settings
+codec = dict(type='RegressionLabel', input_size=(256, 256))
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ neck=dict(type='GlobalAveragePooling'),
+ head=dict(
+ type='RegressionHead',
+ in_channels=2048,
+ num_joints=21,
+ loss=dict(type='SmoothL1Loss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'Rhd2DDataset'
+data_mode = 'topdown'
+data_root = 'data/rhd/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(
+ type='RandomBBoxTransform', rotate_factor=180,
+ scale_factor=(0.7, 1.3)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_train.json',
+ data_prefix=dict(img=''),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/rhd_test.json',
+ data_prefix=dict(img=''),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# evaluators
+val_evaluator = [
+ dict(type='PCKAccuracy', thr=0.2),
+ dict(type='AUC'),
+ dict(type='EPE'),
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/README.md b/modules/rtmpose/configs/hand_3d_keypoint/README.md
new file mode 100644
index 0000000..752fd92
--- /dev/null
+++ b/modules/rtmpose/configs/hand_3d_keypoint/README.md
@@ -0,0 +1,7 @@
+# 3D Hand Pose Estimation
+
+3D hand pose estimation is the task of estimating the 3D keypoint locations of the hand from an input image.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/3d_hand_keypoint.md) to prepare data.
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md b/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md
new file mode 100644
index 0000000..d3d1260
--- /dev/null
+++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/README.md
@@ -0,0 +1,10 @@
+# InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image
+
+## Results and Models
+
+### InterHand2.6M 3D Dataset
+
+| Arch | Set | MPJPE-single | MPJPE-interacting | MPJPE-all | MRRPE | APh | ckpt | log | Details and Download |
+| :------------------------------- | :-------: | :----------: | :---------------: | :-------: | :---: | :--: | :------------------------------: | :-----------------------------: | :-----------------------------------------------: |
+| [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | test(H+M) | 9.47 | 13.40 | 11.59 | 29.28 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | [internet_interhand3d.md](./interhand3d/internet_interhand3d.md) |
+| [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | val(M) | 11.22 | 15.23 | 13.16 | 31.73 | 0.98 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) | [internet_interhand3d.md](./interhand3d/internet_interhand3d.md) |
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md
new file mode 100644
index 0000000..b706b75
--- /dev/null
+++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.md
@@ -0,0 +1,59 @@
+
+
+
+InterNet (ECCV'2020)
+
+```bibtex
+@InProceedings{Moon_2020_ECCV_InterHand2.6M,
+author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
+title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image},
+booktitle = {European Conference on Computer Vision (ECCV)},
+year = {2020}
+}
+```
+
+
+
+
+
+
+ResNet (CVPR'2016)
+
+```bibtex
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+
+
+
+
+
+InterHand2.6M (ECCV'2020)
+
+```bibtex
+@InProceedings{Moon_2020_ECCV_InterHand2.6M,
+author = {Moon, Gyeongsik and Yu, Shoou-I and Wen, He and Shiratori, Takaaki and Lee, Kyoung Mu},
+title = {InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation from a Single RGB Image},
+booktitle = {European Conference on Computer Vision (ECCV)},
+year = {2020}
+}
+```
+
+
+
+Results on InterHand2.6M val & test set
+
+| Train Set | Set | Arch | Input Size | MPJPE-single | MPJPE-interacting | MPJPE-all | MRRPE | APh | ckpt | log |
+| :-------- | :-------- | :----------------------------------------: | :--------: | :----------: | :---------------: | :-------: | :---: | :--: | :----------------------------------------: | :---------------------------------------: |
+| All | test(H+M) | [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 9.69 | 13.72 | 11.86 | 29.27 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.pth) | [log](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.json) |
+| All | val(M) | [InterNet_resnet_50](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 11.30 | 15.57 | 13.36 | 32.15 | 0.98 | [ckpt](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.pth) | [log](https://download.openmmlab.com/mmpose/v1/hand_3d_keypoint/internet/interhand3d/internet_res50_interhand3d-d6ff20d6_20230913.json) |
+| All | test(H+M) | [InterNet_resnet_50\*](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 9.47 | 13.40 | 11.59 | 29.28 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) |
+| All | val(M) | [InterNet_resnet_50\*](/configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/res50_interhand3d_all_256x256.py) | 256x256 | 11.22 | 15.23 | 13.16 | 31.73 | 0.98 | [ckpt](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth) | [log](https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256_20210702.log.json) |
+
+*Models marked with \* were trained in [MMPose 0.x](https://github.com/open-mmlab/mmpose/tree/0.x). Their checkpoints and logs are provided for validation only.*
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml
new file mode 100644
index 0000000..9446755
--- /dev/null
+++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_interhand3d.yml
@@ -0,0 +1,35 @@
+Collections:
+- Name: InterNet
+ Paper:
+ Title: 'InterHand2.6M: A Dataset and Baseline for 3D Interacting Hand Pose Estimation
+ from a Single RGB Image'
+ URL: https://link.springer.com/content/pdf/10.1007/978-3-030-58565-5_33.pdf
+ README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/internet.md
+Models:
+- Config: configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py
+ In Collection: InterNet
+ Alias: hand3d
+ Metadata:
+ Architecture: &id001
+ - InterNet
+ - ResNet
+ Training Data: InterHand2.6M
+ Name: internet_res50_4xb16-20e_interhand3d-256x256
+ Results:
+ - Dataset: InterHand2.6M (H+M)
+ Metrics:
+ APh: 0.99
+ MPJPE-all: 11.86
+ MPJPE-interacting: 13.72
+ MPJPE-single: 9.69
+ MRRPE: 29.27
+ Task: Hand 3D Keypoint
+ - Dataset: InterHand2.6M (M)
+ Metrics:
+ APh: 0.98
+ MPJPE-all: 13.36
+ MPJPE-interacting: 15.57
+ MPJPE-single: 11.30
+ MRRPE: 32.15
+ Task: Hand 3D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/hand3d/internet/res50_intehand3dv1.0_all_256x256-42b7f2ac_20210702.pth
diff --git a/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py
new file mode 100644
index 0000000..49951e7
--- /dev/null
+++ b/modules/rtmpose/configs/hand_3d_keypoint/internet/interhand3d/internet_res50_4xb16-20e_interhand3d-256x256.py
@@ -0,0 +1,182 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# visualization
+vis_backends = [
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime
+train_cfg = dict(max_epochs=20, val_interval=1)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.0002))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ milestones=[15, 17],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+auto_scale_lr = dict(base_batch_size=128)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ type='CheckpointHook',
+ interval=1,
+ save_best='MPJPE_all',
+ rule='less',
+ max_keep_ckpts=1),
+ logger=dict(type='LoggerHook', interval=20),
+)
+
+# codec settings
+codec = dict(
+ type='Hand3DHeatmap',
+ image_size=[256, 256],
+ root_heatmap_size=64,
+ heatmap_size=[64, 64, 64],
+ sigma=2.5,
+ max_bound=255,
+ depth_size=64)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ head=dict(
+ type='InternetHead',
+ keypoint_head_cfg=dict(
+ in_channels=2048,
+ out_channels=21 * 64,
+ depth_size=codec['depth_size'],
+ deconv_out_channels=(256, 256, 256),
+ deconv_kernel_sizes=(4, 4, 4),
+ ),
+ root_head_cfg=dict(
+ in_channels=2048,
+ heatmap_size=codec['root_heatmap_size'],
+ hidden_dims=(512, ),
+ ),
+ hand_type_head_cfg=dict(
+ in_channels=2048,
+ num_labels=2,
+ hidden_dims=(512, ),
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=False))
+
+# base dataset settings
+dataset_type = 'InterHand3DDataset'
+data_mode = 'topdown'
+data_root = 'data/interhand2.6m/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='HandRandomFlip', prob=0.5),
+ dict(type='RandomBBoxTransform', rotate_factor=90.0),
+ dict(type='TopdownAffine', input_size=codec['image_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'rotation', 'img_shape',
+ 'focal', 'principal_pt', 'input_size', 'input_center',
+ 'input_scale', 'hand_type', 'hand_type_valid', 'flip',
+ 'flip_indices', 'abs_depth'))
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['image_size']),
+ dict(
+ type='PackPoseInputs',
+ meta_keys=('id', 'img_id', 'img_path', 'rotation', 'img_shape',
+ 'focal', 'principal_pt', 'input_size', 'input_center',
+ 'input_scale', 'hand_type', 'hand_type_valid', 'flip',
+ 'flip_indices', 'abs_depth'))
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='annotations/all/InterHand2.6M_train_camera.json',
+ joint_file='annotations/all/InterHand2.6M_train_joint_3d.json',
+ use_gt_root_depth=True,
+ rootnet_result_file=None,
+ data_mode=data_mode,
+ data_root=data_root,
+ data_prefix=dict(img='images/train/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotations/machine_annot/InterHand2.6M_val_data.json',
+ camera_param_file='annotations/machine_annot/'
+ 'InterHand2.6M_val_camera.json',
+ joint_file='annotations/machine_annot/InterHand2.6M_val_joint_3d.json',
+ use_gt_root_depth=True,
+ rootnet_result_file=None,
+ data_mode=data_mode,
+ data_root=data_root,
+ data_prefix=dict(img='images/val/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+test_dataloader = dict(
+ batch_size=16,
+ num_workers=1,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file='annotations/all/'
+ 'InterHand2.6M_test_data.json',
+ camera_param_file='annotations/all/'
+ 'InterHand2.6M_test_camera.json',
+ joint_file='annotations/all/'
+ 'InterHand2.6M_test_joint_3d.json',
+ use_gt_root_depth=True,
+ rootnet_result_file=None,
+ data_mode=data_mode,
+ data_root=data_root,
+ data_prefix=dict(img='images/test/'),
+ pipeline=val_pipeline,
+ test_mode=True,
+ ))
+
+# evaluators
+val_evaluator = [
+ dict(type='InterHandMetric', modes=['MPJPE', 'MRRPE', 'HandednessAcc'])
+]
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/hand_gesture/README.md b/modules/rtmpose/configs/hand_gesture/README.md
new file mode 100644
index 0000000..aaf4235
--- /dev/null
+++ b/modules/rtmpose/configs/hand_gesture/README.md
@@ -0,0 +1,13 @@
+# Gesture Recognition
+
+Gesture recognition aims to recognize hand gestures in videos, such as a thumbs-up.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_hand_gesture.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/gesture_recognition_demo.md) to run the demo.
+
+
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/README.md
new file mode 100644
index 0000000..305dd96
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/README.md
@@ -0,0 +1,19 @@
+# 2D Human Whole-Body Pose Estimation
+
+2D human whole-body pose estimation aims to localize dense landmarks over the entire human body, including the face, hands, body, and feet.
+
+Existing approaches can be categorized into top-down and bottom-up approaches.
+
+Top-down methods divide the task into two stages: human detection and whole-body pose estimation. They perform human detection first, followed by single-person whole-body pose estimation given human bounding boxes.
+
+Bottom-up approaches (e.g. Associative Embedding) first detect all the whole-body keypoints and then group/associate them into person instances.
+
+## Data preparation
+
+Please follow [DATA Preparation](/docs/en/dataset_zoo/2d_wholebody_keypoint.md) to prepare data.
+
+## Demo
+
+Please follow [Demo](/demo/docs/en/2d_wholebody_pose_demo.md) to run demos.
+
+
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md
new file mode 100644
index 0000000..7bc9529
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/README.md
@@ -0,0 +1,63 @@
+# DWPose
+
+Whole-body pose estimation localizes the human body, hand, face, and foot keypoints in an image. This task is challenging due to multi-scale body parts, fine-grained localization of low-resolution regions, and data scarcity. Meanwhile, there is an urgent need for highly efficient and accurate pose estimators in a wide range of human-centric understanding and generation tasks. In this work, we present a two-stage pose **D**istillation for **W**hole-body **P**ose estimators, named **DWPose**, to improve their effectiveness and efficiency. The first-stage distillation designs a weight-decay strategy while utilizing a teacher's intermediate feature and final logits with both visible and invisible keypoints to supervise the student from scratch. The second stage distills the student model itself to further improve performance. Different from previous self-knowledge distillation, this stage finetunes the student's head with only 20% of the training time, as a plug-and-play training strategy. To address data limitations, we explore the UBody dataset, which contains diverse facial expressions and hand gestures for real-life applications. Comprehensive experiments show the superiority of our proposed simple yet effective methods. We achieve new state-of-the-art performance on COCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from 64.8% to 66.5%, even surpassing the RTMPose-x teacher with 65.3% AP. We release a series of models of different sizes, from tiny to large, to satisfy various downstream tasks.
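+
+As a rough illustration of the two first-stage supervision signals described above, here is a simplified sketch (not the exact DWPose implementation; the channel-aligning projection between student and teacher features is omitted, and the default weights mirror `alpha_fea` and the KD loss `weight` used in the S1 dis_configs):
+
+```python
+import torch.nn.functional as F
+
+def distill_loss(stu_feat, tea_feat, stu_logits, tea_logits,
+                 epoch, max_epochs, alpha_fea=7e-5, alpha_logit=0.1):
+    """First-stage distillation: feature MSE plus soft-logit KD,
+    scaled by a weight that decays over training."""
+    gamma = 1.0 - epoch / max_epochs  # weight-decay strategy
+    loss_fea = F.mse_loss(stu_feat, tea_feat)
+    loss_logit = F.kl_div(
+        F.log_softmax(stu_logits, dim=-1),
+        F.softmax(tea_logits, dim=-1),
+        reduction='batchmean')
+    return gamma * (alpha_fea * loss_fea + alpha_logit * loss_logit)
+```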
+
+## Results and Models
+
+### COCO-WholeBody Dataset
+
+Results on the COCO-WholeBody v1.0 val set, using a detector with 56.4 human AP on the COCO val2017 dataset
+
+- DWPose Models are supported by [DWPose](https://github.com/IDEA-Research/DWPose)
+- Models are trained and distilled on the following datasets:
+ - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/)
+ - [UBody](https://github.com/IDEA-Research/OSX)
+
+| Config | S1 Dis_config | S2 Dis_config | Input Size | Whole AP | Whole AR | FLOPS (G) | ORT-Latency (ms) (i7-11700) | TRT-FP16-Latency (ms) (GTX 1660Ti) | Download |
+| :----------- | :-----------------: | :-----------------: | :--------: | :------: | :------: | :---------------: | :-----------------------------------------: | :------------------------------------------------: | :------------: |
+| [DWPose-t](../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-t](../dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py) | [DW t-t](../dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py) | 256x192 | 48.5 | 58.4 | 0.5 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-ucoco_dw-ucoco_270e-256x192-dcf277bf_20230728.pth) |
+| [DWPose-s](../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-s](../dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py) | [DW s-s](../dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py) | 256x192 | 53.8 | 63.2 | 0.9 | - | - | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-ucoco_dw-ucoco_270e-256x192-3fd922c8_20230728.pth) |
+| [DWPose-m](../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW l-m](../dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py) | [DW m-m](../dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py) | 256x192 | 60.6 | 69.5 | 2.22 | 13.50 | 4.00 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth) |
+| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py) | [DW x-l](../dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py) | 256x192 | 63.1 | 71.7 | 4.52 | 23.41 | 5.67 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth) |
+| [DWPose-l](../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py) | [DW x-l](../dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-384x288.py) | [DW l-l](../dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py) | 384x288 | 66.5 | 74.3 | 10.07 | 44.58 | 7.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-384x288-2438fd99_20230728.pth) |
+
+## Train a model
+
+### Train DWPose with the first stage distillation
+
+```bash
+bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py 8
+```
+
+### Transfer the S1 distillation models into regular models
+
+```bash
+# first stage distillation
+python pth_transfer.py $dis_ckpt $new_pose_ckpt
+```
+
+⭐ Before S2 distillation, set `teacher_pretrained` in your S2 dis_config to the path of your transferred S1 model, as sketched below.
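+
+A minimal sketch of the relevant keys in an S2 dis_config (paths are illustrative; cf. the configs under `coco-wholebody/s2_dis/`):
+
+```python
+model = dict(
+    type='DWPoseDistiller',
+    two_dis=True,
+    # transferred S1 teacher produced by pth_transfer.py
+    teacher_pretrained='work_dirs/dwpose_x_dis_l_coco-384x288/'
+    'dw-x-l_coco_384.pth',
+    # teacher_cfg, student_cfg, distill_cfg, etc. stay unchanged
+)
+```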
+
+### Train DWPose with the second stage distillation
+
+```bash
+bash tools/dist_train.sh configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py 8
+```
+
+### Transfer the S2 distillation models into regular models
+
+```bash
+# second stage distillation
+python pth_transfer.py $dis_ckpt $new_pose_ckpt --two_dis
+```
+
+## Citation
+
+```bibtex
+@article{yang2023effective,
+ title={Effective Whole-body Pose Estimation with Two-stages Distillation},
+ author={Yang, Zhendong and Zeng, Ailing and Yuan, Chun and Li, Yu},
+ journal={arXiv preprint arXiv:2307.15880},
+ year={2023}
+}
+```
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py
new file mode 100644
index 0000000..999ebfc
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_l_dis_m_coco-256x192.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-l_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=768,
+ teacher_channels=1024,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py
new file mode 100644
index 0000000..0ee0144
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s1_dis/dwpose_x_dis_l_coco-384x288.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-x_simcc-coco-wholebody_pt-body7_270e-384x288-401dfc90_20230629.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-x_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=1024,
+ teacher_channels=1280,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py
new file mode 100644
index 0000000..dd0cf5f
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_l-ll_coco-384x288.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
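+    # S2 self-distillation: teacher_cfg and student_cfg describe the same
+    # architecture, the teacher being the S1-distilled model; only the logit
+    # KD loss is applied here.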
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_x_dis_l_coco-384x288/dw-x-l_coco_384.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-l_8xb32-270e_coco-wholebody-384x288.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py
new file mode 100644
index 0000000..df3626f
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/coco-wholebody/s2_dis/dwpose_m-mm_coco-256x192.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_l_dis_m_coco-256x192/dw-l-m_coco_256.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
+ 'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py
new file mode 100644
index 0000000..f7c387d
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_m_coco-ubody-256x192.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=768,
+ teacher_channels=1024,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py
new file mode 100644
index 0000000..3d6c4aa
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_s_coco-ubody-256x192.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=512,
+ teacher_channels=1024,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py
new file mode 100644
index 0000000..5325db9
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_l_dis_t_coco-ubody-256x192.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../../../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py'  # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_ucoco_256x192-95bb32f5_20230822.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=384,
+ teacher_channels=1024,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py
new file mode 100644
index 0000000..b1389d3
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/dwpose_x_dis_l_coco-ubody-256x192.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-x_ucoco_256x192-05f5bcb7_20230822.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=1024,
+ teacher_channels=1280,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py
new file mode 100644
index 0000000..948116d
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s1_dis/rtmpose_x_dis_l_coco-ubody-384x288.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = False
+
+# config settings
+fea = True
+logit = True
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ teacher_pretrained='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-x_ucoco_384x288-f5b50679_20230822.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='FeaLoss',
+ name='loss_fea',
+ use_this=fea,
+ student_channels=1024,
+ teacher_channels=1280,
+ alpha_fea=0.00007,
+ )
+ ]),
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=0.1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+)
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py
new file mode 100644
index 0000000..ce28745
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-256x192.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_x_dis_l_coco-ubody-256x192/dw-x-l_ucoco_256.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py
new file mode 100644
index 0000000..b049b50
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_l-ll_coco-ubody-384x288.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_x_dis_l_coco-ubody-384x288/dw-x-l_ucoco_384.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py
new file mode 100644
index 0000000..3050f08
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_m-mm_coco-ubody-256x192.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_l_dis_m_coco-ubody-256x192/dw-l-m_ucoco_256.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py
new file mode 100644
index 0000000..83423d5
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_s-ss_coco-ubody-256x192.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_l_dis_s_coco-ubody-256x192/dw-l-s_ucoco_256.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py
new file mode 100644
index 0000000..be772a7
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/dwpose/ubody/s2_dis/dwpose_t-tt_coco-ubody-256x192.py
@@ -0,0 +1,45 @@
+_base_ = [
+ '../../../rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py' # noqa: E501
+]
+
+# model settings
+find_unused_parameters = True
+
+# dis settings
+second_dis = True
+
+# config settings
+logit = True
+
+train_cfg = dict(max_epochs=60, val_interval=10)
+
+# method details
+model = dict(
+ _delete_=True,
+ type='DWPoseDistiller',
+ two_dis=second_dis,
+ teacher_pretrained='work_dirs/'
+ 'dwpose_l_dis_t_coco-ubody-256x192/dw-l-t_ucoco_256.pth', # noqa: E501
+ teacher_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ student_cfg='configs/wholebody_2d_keypoint/rtmpose/ubody/'
+ 'rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py', # noqa: E501
+ distill_cfg=[
+ dict(methods=[
+ dict(
+ type='KDLoss',
+ name='loss_logit',
+ use_this=logit,
+ weight=1,
+ )
+ ]),
+ ],
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ train_cfg=train_cfg,
+)
+
+optim_wrapper = dict(clip_grad=dict(max_norm=1., norm_type=2))
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md
new file mode 100644
index 0000000..bddca2c
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/README.md
@@ -0,0 +1,18 @@
+# RTMPose
+
+Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet industrial deployment still suffers from large parameter counts and high latency.
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
+Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
+To further evaluate RTMPose's capability in critical real-time applications, we also report performance after deployment on mobile devices.
+
+## Results and Models
+
+### COCO-WholeBody Dataset
+
+Results on the COCO-WholeBody v1.0 val set, using a detector with 56.4 human AP on COCO val2017:
+
+| Model | Input Size | Whole AP | Whole AR | Details and Download |
+| :-------: | :--------: | :------: | :------: | :---------------------------------------------------------------------: |
+| RTMPose-m | 256x192 | 0.582 | 0.674 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) |
+| RTMPose-l | 256x192 | 0.611 | 0.700 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) |
+| RTMPose-l | 384x288 | 0.648 | 0.730 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) |
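+
+To run one of these models, the standard MMPose 1.x top-down API works with the configs above (a minimal sketch; the checkpoint filename is a hypothetical stand-in for whichever weights you download):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+# config shipped in this folder; checkpoint is whatever model you downloaded
+cfg = 'coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py'
+ckpt = 'rtmpose-m_coco-wholebody.pth'  # hypothetical local filename
+
+model = init_model(cfg, ckpt, device='cpu')
+results = inference_topdown(model, 'person.jpg')  # list of PoseDataSample
+print(results[0].pred_instances.keypoints.shape)  # (num_instances, 133, 2)
+```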
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py
new file mode 100644
index 0000000..a037573
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py
@@ -0,0 +1,615 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 1024
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=8192)
+
+# codec settings
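+# SimCC casts keypoint localization as two 1-D classifications (one over x,
+# one over y), with simcc_split_ratio bins per input pixel.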
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[256, 512, 1024],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1024,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=1.,
+ label_softmax=True,
+ label_beta=10.,
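+            # down-weight the 68 face keypoints (indices 23-90) by mask_weight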
+ mask=list(range(23, 91)),
+ mask_weight=0.5,
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
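+# Halpe -> COCO-WholeBody: body joints 0-16 match directly, 20-25 are the six
+# foot keypoints reordered into slots 17-22, and the face/hand indices 26-135
+# shift down by 3.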
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale', padding=1.25),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+wflw_coco133 = [(i * 2, 23 + i)
+ for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [
+ (42 + i, 45 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 23 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 31)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [
+ (33 + i, 40 + i) for i in range(5)
+] + [(42 + i, 45 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+hand_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
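+# InterHand2.6M lists right-hand joints at 0-20 and left-hand at 21-41, while
+# COCO-WholeBody puts the left hand at 91-111 and the right at 112-132; the
+# right-hand mapping is therefore the left-hand one shifted by 21 on each side.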
+interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98),
+ (27, 97), (28, 96), (29, 103), (30, 102), (31, 101),
+ (32, 100), (33, 107), (34, 106), (35, 105), (36, 104),
+ (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)]
+interhand_right = [(i - 21, j + 21) for i, j in interhand_left]
+interhand_coco133 = interhand_right + interhand_left
+
+dataset_interhand2d = dict(
+ type='InterHand2DDoubleDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_camera.json',
+ joint_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_joint_3d.json',
+ data_prefix=dict(img='interhand2.6m/images/train/'),
+ sample_interval=10,
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=interhand_coco133,
+ ), *hand_pipeline
+ ],
+)
+
+dataset_hand = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_interhand2d],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=4,
+ pin_memory=False,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py
new file mode 100644
index 0000000..e095f3b
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py
@@ -0,0 +1,617 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 320
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # cosine annealing over the second half of training (epochs 135-270)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=2560)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False,
+ decode_visibility=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[256, 512, 1024],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1024,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=1.,
+ label_softmax=True,
+ label_beta=10.,
+ mask=list(range(23, 91)),
+ mask_weight=0.5,
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale', padding=1.25),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+wflw_coco133 = [(i * 2, 23 + i)
+ for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [
+ (42 + i, 45 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 23 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 31)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [
+ (33 + i, 40 + i) for i in range(5)
+] + [(42 + i, 45 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+hand_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98),
+ (27, 97), (28, 96), (29, 103), (30, 102), (31, 101),
+ (32, 100), (33, 107), (34, 106), (35, 105), (36, 104),
+ (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)]
+interhand_right = [(i - 21, j + 21) for i, j in interhand_left]
+interhand_coco133 = interhand_right + interhand_left
+
+dataset_interhand2d = dict(
+ type='InterHand2DDoubleDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_camera.json',
+ joint_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_joint_3d.json',
+ data_prefix=dict(img='interhand2.6m/images/train/'),
+ sample_interval=10,
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=interhand_coco133,
+ ), *hand_pipeline
+ ],
+)
+
+dataset_hand = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_interhand2d],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=4,
+ pin_memory=False,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py
new file mode 100644
index 0000000..cec4872
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py
@@ -0,0 +1,615 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 1024
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=8192)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/rtmpose-m_simcc-ucoco_dw-ucoco_270e-256x192-c8b76419_20230728.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[192, 384, 768],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=768,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=1.,
+ label_softmax=True,
+ label_beta=10.,
+ mask=list(range(23, 91)),
+ mask_weight=0.5,
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
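+# Halpe-136 keeps the 17 COCO body points at the same indices; its six foot
+# points (20-25) land on COCO-WholeBody 17-22, and the remaining face/hand
+# points (26-135) shift down by 3 onto 23-132.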
+
+posetrack_coco133 = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale', padding=1.25),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+wflw_coco133 = [(i * 2, 23 + i)
+ for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [
+ (42 + i, 45 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 23 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 31)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [
+ (33 + i, 40 + i) for i in range(5)
+] + [(42 + i, 45 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+hand_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98),
+ (27, 97), (28, 96), (29, 103), (30, 102), (31, 101),
+ (32, 100), (33, 107), (34, 106), (35, 105), (36, 104),
+ (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)]
+interhand_right = [(i - 21, j + 21) for i, j in interhand_left]
+interhand_coco133 = interhand_right + interhand_left
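+# Per the mapping above, InterHand2.6M orders the right hand first (0-20) and
+# the left hand second (21-41), while COCO-WholeBody places the left hand at
+# 91-111 and the right hand at 112-132 -- hence the (i - 21, j + 21) shift.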
+
+dataset_interhand2d = dict(
+ type='InterHand2DDoubleDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_camera.json',
+ joint_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_joint_3d.json',
+ data_prefix=dict(img='interhand2.6m/images/train/'),
+ sample_interval=10,
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=interhand_coco133,
+ ), *hand_pipeline
+ ],
+)
+
+dataset_hand = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_interhand2d],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand]
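+# Each CombinedDataset above first remaps its sources into the shared
+# 133-keypoint space via the per-dataset KeypointConverter pipelines; the
+# common train_pipeline below is then applied once to the merged samples.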
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=4,
+ pin_memory=False,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
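+# PipelineSwitchHook drops the heavier augmentations for the final
+# stage2_num_epochs (epochs 260-270 here) by swapping in train_pipeline_stage2.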
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py
new file mode 100644
index 0000000..e5f1754
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py
@@ -0,0 +1,617 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 320
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ # cosine lr over the second half of training (epochs 135 to 270)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=2560)
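+# When launched with auto-scale-lr enabled, mmengine rescales the optimizer lr
+# by (actual total batch size / base_batch_size); the reference setup of
+# 8 GPUs x 320 samples = 2560 leaves base_lr unchanged.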
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False,
+ decode_visibility=True)
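+# SimCC casts keypoint localization as two 1-D classifications over sub-pixel
+# bins; simcc_split_ratio=2.0 gives 2 bins per input pixel (576 x 768 bins for
+# the 288 x 384 crop), and decode_visibility additionally decodes a
+# per-keypoint visibility score.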
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
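+ # gau_cfg configures the Gated Attention Unit (GAU) block used inside
+ # the RTMW head ahead of the SimCC classification branches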
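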
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=1.,
+ label_softmax=True,
+ label_beta=10.,
+ mask=list(range(23, 91)),
+ mask_weight=0.5,
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale', padding=1.25),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+wflw_coco133 = [(i * 2, 23 + i)
+ for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [
+ (42 + i, 45 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 23 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 31)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [
+ (33 + i, 40 + i) for i in range(5)
+] + [(42 + i, 45 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+hand_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98),
+ (27, 97), (28, 96), (29, 103), (30, 102), (31, 101),
+ (32, 100), (33, 107), (34, 106), (35, 105), (36, 104),
+ (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)]
+interhand_right = [(i - 21, j + 21) for i, j in interhand_left]
+interhand_coco133 = interhand_right + interhand_left
+
+dataset_interhand2d = dict(
+ type='InterHand2DDoubleDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_camera.json',
+ joint_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_joint_3d.json',
+ data_prefix=dict(img='interhand2.6m/images/train/'),
+ sample_interval=10,
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=interhand_coco133,
+ ), *hand_pipeline
+ ],
+)
+
+dataset_hand = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_interhand2d],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=4,
+ pin_memory=False,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py
new file mode 100644
index 0000000..d9852cf
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py
@@ -0,0 +1,615 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 10
+base_lr = 5e-4
+train_batch_size = 704
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.1),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=5632)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
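+# Note the smaller Gaussian sigma than the 384x288 variant ((4.9, 5.66) vs
+# (6., 6.93)): the label smoothing is scaled roughly with the input resolution.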
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ channel_attention=True,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/'
+ 'wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-256x192-05f5bcb7_20230822.pth' # noqa
+ )),
+ neck=dict(
+ type='CSPNeXtPAFPN',
+ in_channels=[320, 640, 1280],
+ out_channels=None,
+ out_indices=(
+ 1,
+ 2,
+ ),
+ num_csp_blocks=2,
+ expand_ratio=0.5,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU', inplace=True)),
+ head=dict(
+ type='RTMWHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=input_size,
+ in_featuremap_size=tuple([s // 32 for s in input_size]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=1.,
+ label_softmax=True,
+ label_beta=10.,
+ mask=list(range(23, 91)),
+ mask_weight=0.5,
+ ),
+ decoder=codec),
+ test_cfg=dict(flip_test=True))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PhotometricDistortion'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ ]),
+ dict(
+ type='GenerateTarget',
+ encoder=codec,
+ use_dataset_keypoint_weights=True),
+ dict(type='PackPoseInputs')
+]
+
+# mapping
+
+aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)]
+
+crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
+ (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)]
+
+mpii_coco133 = [
+ (0, 16),
+ (1, 14),
+ (2, 12),
+ (3, 11),
+ (4, 13),
+ (5, 15),
+ (10, 10),
+ (11, 8),
+ (12, 6),
+ (13, 5),
+ (14, 7),
+ (15, 9),
+]
+
+jhmdb_coco133 = [
+ (3, 6),
+ (4, 5),
+ (5, 12),
+ (6, 11),
+ (7, 8),
+ (8, 7),
+ (9, 14),
+ (10, 13),
+ (11, 10),
+ (12, 9),
+ (13, 16),
+ (14, 15),
+]
+
+halpe_coco133 = [(i, i)
+ for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21),
+ (24, 19),
+ (25, 22)] + [(i, i - 3)
+ for i in range(26, 136)]
+
+posetrack_coco133 = [
+ (0, 0),
+ (3, 3),
+ (4, 4),
+ (5, 5),
+ (6, 6),
+ (7, 7),
+ (8, 8),
+ (9, 9),
+ (10, 10),
+ (11, 11),
+ (12, 12),
+ (13, 13),
+ (14, 14),
+ (15, 15),
+ (16, 16),
+]
+
+humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120),
+ (19, 17), (20, 20)]
+
+# train datasets
+dataset_coco = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='detection/coco/train2017/'),
+ pipeline=[],
+)
+
+dataset_aic = dict(
+ type='AicDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='aic/annotations/aic_train.json',
+ data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
+ '_train_20170902/keypoint_train_images_20170902/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=aic_coco133)
+ ],
+)
+
+dataset_crowdpose = dict(
+ type='CrowdPoseDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
+ data_prefix=dict(img='pose/CrowdPose/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=crowdpose_coco133)
+ ],
+)
+
+dataset_mpii = dict(
+ type='MpiiDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='mpii/annotations/mpii_train.json',
+ data_prefix=dict(img='pose/MPI/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mpii_coco133)
+ ],
+)
+
+dataset_jhmdb = dict(
+ type='JhmdbDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='jhmdb/annotations/Sub1_train.json',
+ data_prefix=dict(img='pose/JHMDB/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=jhmdb_coco133)
+ ],
+)
+
+dataset_halpe = dict(
+ type='HalpeDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=halpe_coco133)
+ ],
+)
+
+dataset_posetrack = dict(
+ type='PoseTrack18Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='posetrack18/annotations/posetrack18_train.json',
+ data_prefix=dict(img='pose/PoseChallenge2018/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=posetrack_coco133)
+ ],
+)
+
+dataset_humanart = dict(
+ type='HumanArt21Dataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='HumanArt/annotations/training_humanart.json',
+ filter_cfg=dict(scenes=['real_human']),
+ data_prefix=dict(img='pose/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=humanart_coco133)
+ ])
+
+ubody_scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+ubody_datasets = []
+for scene in ubody_scenes:
+ each = dict(
+ type='UBody2dDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'Ubody/annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='pose/UBody/images/'),
+ pipeline=[],
+ sample_interval=10)
+ ubody_datasets.append(each)
+
+dataset_ubody = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/ubody2d.py'),
+ datasets=ubody_datasets,
+ pipeline=[],
+ test_mode=False,
+)
+
+face_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale', padding=1.25),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+wflw_coco133 = [(i * 2, 23 + i)
+ for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [
+ (42 + i, 45 + i) for i in range(5)
+ ] + [(51 + i, 50 + i)
+ for i in range(9)] + [(60, 59), (61, 60), (63, 61),
+ (64, 62), (65, 63), (67, 64),
+ (68, 65), (69, 66), (71, 67),
+ (72, 68), (73, 69),
+ (75, 70)] + [(76 + i, 71 + i)
+ for i in range(20)]
+dataset_wflw = dict(
+ type='WFLWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='wflw/annotations/face_landmarks_wflw_train.json',
+ data_prefix=dict(img='pose/WFLW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=wflw_coco133), *face_pipeline
+ ],
+)
+
+mapping_300w_coco133 = [(i, 23 + i) for i in range(68)]
+dataset_300w = dict(
+ type='Face300WDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='300w/annotations/face_landmarks_300w_train.json',
+ data_prefix=dict(img='pose/300w/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=mapping_300w_coco133), *face_pipeline
+ ],
+)
+
+cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59),
+ (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53),
+ (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89),
+ (27, 80), (28, 31)]
+dataset_cofw = dict(
+ type='COFWDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='cofw/annotations/cofw_train.json',
+ data_prefix=dict(img='pose/COFW/images/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=cofw_coco133), *face_pipeline
+ ],
+)
+
+lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [
+ (33 + i, 40 + i) for i in range(5)
+] + [(42 + i, 45 + i) for i in range(5)] + [
+ (51 + i, 50 + i) for i in range(4)
+] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61),
+ (70, 62), (71, 63), (73, 64),
+ (75, 65), (76, 66), (78, 67),
+ (79, 68), (80, 69),
+ (82, 70)] + [(84 + i, 71 + i)
+ for i in range(20)]
+dataset_lapa = dict(
+ type='LapaDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='LaPa/annotations/lapa_trainval.json',
+ data_prefix=dict(img='pose/LaPa/'),
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=lapa_coco133), *face_pipeline
+ ],
+)
+
+dataset_wb = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_coco, dataset_halpe, dataset_ubody],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_body = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_aic,
+ dataset_crowdpose,
+ dataset_mpii,
+ dataset_jhmdb,
+ dataset_posetrack,
+ dataset_humanart,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+dataset_face = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[
+ dataset_wflw,
+ dataset_300w,
+ dataset_cofw,
+ dataset_lapa,
+ ],
+ pipeline=[],
+ test_mode=False,
+)
+
+hand_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[1.5, 2.0],
+ rotate_factor=0),
+]
+
+interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98),
+ (27, 97), (28, 96), (29, 103), (30, 102), (31, 101),
+ (32, 100), (33, 107), (34, 106), (35, 105), (36, 104),
+ (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)]
+interhand_right = [(i - 21, j + 21) for i, j in interhand_left]
+interhand_coco133 = interhand_right + interhand_left
+
+dataset_interhand2d = dict(
+ type='InterHand2DDoubleDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='interhand26m/annotations/all/InterHand2.6M_train_data.json',
+ camera_param_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_camera.json',
+ joint_file='interhand26m/annotations/all/'
+ 'InterHand2.6M_train_joint_3d.json',
+ data_prefix=dict(img='interhand2.6m/images/train/'),
+ sample_interval=10,
+ pipeline=[
+ dict(
+ type='KeypointConverter',
+ num_keypoints=num_keypoints,
+ mapping=interhand_coco133,
+ ), *hand_pipeline
+ ],
+)
+
+dataset_hand = dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=[dataset_interhand2d],
+ pipeline=[],
+ test_mode=False,
+)
+
+train_datasets = [dataset_wb, dataset_body, dataset_face, dataset_hand]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=4,
+ pin_memory=False,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/detection/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md
new file mode 100644
index 0000000..678bc64
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.md
@@ -0,0 +1,80 @@
+
+**RTMPose (arXiv'2023)**
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2303.07399,
+ doi = {10.48550/ARXIV.2303.07399},
+ url = {https://arxiv.org/abs/2303.07399},
+ author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+```
+
+**RTMDet (arXiv'2022)**
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+**COCO-WholeBody (ECCV'2020)**
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+- `Cocktail14` denotes a model trained on the following 14 public datasets:
+ - [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
+ - [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
+ - [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
+ - [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
+ - [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
+ - [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
+ - [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody/)
+ - [UBody](https://github.com/IDEA-Research/OSX)
+ - [Human-Art](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#human-art-dataset)
+ - [WFLW](https://wywu.github.io/projects/LAB/WFLW.html)
+ - [300W](https://ibug.doc.ic.ac.uk/resources/300-W/)
+ - [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/)
+ - [LaPa](https://github.com/JDAI-CV/lapa-dataset)
+ - [InterHand](https://mks0601.github.io/InterHand2.6M/)
+
+Results on the COCO-WholeBody v1.0 val set, obtained with a person detector that scores 56.4 human AP on COCO val2017:
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------------------------: | :-: |
+| [rtmw-m](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py) | 256x192 | 0.676 | 0.747 | 0.671 | 0.794 | 0.783 | 0.854 | 0.491 | 0.604 | 0.582 | 0.673 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-l-m_simcc-cocktail14_270e-256x192-20231122.pth) | - |
+| [rtmw-l](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py) | 256x192 | 0.743 | 0.807 | 0.763 | 0.868 | 0.834 | 0.889 | 0.598 | 0.701 | 0.660 | 0.746 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-256x192-20231122.pth) | - |
+| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py) | 256x192 | 0.746 | 0.808 | 0.770 | 0.869 | 0.844 | 0.896 | 0.610 | 0.710 | 0.672 | 0.752 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-256x192-13a2546d_20231208.pth) | - |
+| [rtmw-l](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py) | 384x288 | 0.761 | 0.824 | 0.793 | 0.885 | 0.884 | 0.921 | 0.663 | 0.752 | 0.701 | 0.780 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-384x288-20231122.pth) | - |
+| [rtmw-x](/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py) | 384x288 | 0.763 | 0.826 | 0.796 | 0.888 | 0.884 | 0.923 | 0.664 | 0.755 | 0.702 | 0.781 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288-f840f204_20231122.pth) | - |
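+
+For reference, a minimal top-down inference sketch using the mmpose 1.x Python API. The config/checkpoint pair is taken from the table above; the demo image and the hard-coded box are placeholders to adapt to your setup (in practice the boxes would come from a person detector such as RTMDet):
+
+```python
+import numpy as np
+
+from mmpose.apis import inference_topdown, init_model
+from mmpose.structures import merge_data_samples
+
+# config and checkpoint from the 256x192 rtmw-x row above
+config = 'configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py'
+checkpoint = 'https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-256x192-13a2546d_20231208.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+
+# one person box in xyxy format (placeholder values)
+bboxes = np.array([[50, 50, 250, 400]])
+results = inference_topdown(model, 'demo.jpg', bboxes, bbox_format='xyxy')
+
+# merge per-box results and read out the 133 whole-body keypoints
+keypoints = merge_data_samples(results).pred_instances.keypoints
+print(keypoints.shape)  # (num_persons, 133, 2)
+```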
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml
new file mode 100644
index 0000000..4d84c88
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw_cocktail14.yml
@@ -0,0 +1,108 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-m_8xb1024-270e_cocktail14-256x192.py
+ In Collection: RTMPose
+ Alias: wholebody
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: COCO-WholeBody
+ Name: rtmw-m_8xb1024-270e_cocktail14-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.676
+ Body AR: 0.747
+ Face AP: 0.783
+ Face AR: 0.854
+ Foot AP: 0.671
+ Foot AR: 0.794
+ Hand AP: 0.491
+ Hand AR: 0.604
+ Whole AP: 0.582
+ Whole AR: 0.673
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-l-m_simcc-cocktail14_270e-256x192-20231122.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb1024-270e_cocktail14-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmw-l_8xb1024-270e_cocktail14-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.743
+ Body AR: 0.807
+ Face AP: 0.834
+ Face AR: 0.889
+ Foot AP: 0.763
+ Foot AR: 0.868
+ Hand AP: 0.598
+ Hand AR: 0.701
+ Whole AP: 0.660
+ Whole AR: 0.746
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-256x192-20231122.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb704-270e_cocktail14-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmw-x_8xb704-270e_cocktail14-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.746
+ Body AR: 0.808
+ Face AP: 0.844
+ Face AR: 0.896
+ Foot AP: 0.770
+ Foot AR: 0.869
+ Hand AP: 0.610
+ Hand AR: 0.710
+ Whole AP: 0.672
+ Whole AR: 0.752
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-256x192-13a2546d_20231208.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-l_8xb320-270e_cocktail14-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmw-l_8xb320-270e_cocktail14-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.761
+ Body AR: 0.824
+ Face AP: 0.884
+ Face AR: 0.921
+ Foot AP: 0.793
+ Foot AR: 0.885
+ Hand AP: 0.663
+ Hand AR: 0.752
+ Whole AP: 0.701
+ Whole AR: 0.780
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-dw-x-l_simcc-cocktail14_270e-384x288-20231122.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/cocktail14/rtmw-x_8xb320-270e_cocktail14-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmw-x_8xb320-270e_cocktail14-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.763
+ Body AR: 0.826
+ Face AP: 0.884
+ Face AR: 0.923
+ Foot AP: 0.796
+ Foot AR: 0.888
+ Hand AP: 0.664
+ Hand AR: 0.755
+ Whole AP: 0.702
+ Whole AR: 0.781
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmw/rtmw-x_simcc-cocktail14_pt-ucoco_270e-384x288-f840f204_20231122.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..9c881a9
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
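+# Note: the reference batch here (512) is twice the 8 x 32 = 256 implied by
+# the file name, so enabling auto-scale-lr would halve the effective base_lr.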
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..5c57c56
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..2abf14b
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py
@@ -0,0 +1,232 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
+# f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
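Annotation (not part of the patch): in these heads, `in_featuremap_size` is derived from the codec's `input_size` via the backbone's stride of 32, and SimCC classifies each coordinate axis over `simcc_split_ratio ×` the input resolution. A minimal check of that arithmetic for the 256x192 variants:

```python
# The RTMCC head consumes the stride-32 feature map of CSPNeXt, so
# in_featuremap_size is input_size // 32, and SimCC classifies over
# simcc_split_ratio * input_size bins per axis.
input_size = (192, 256)          # (w, h) as in the codec above
simcc_split_ratio = 2.0

in_featuremap_size = tuple(s // 32 for s in input_size)
assert in_featuremap_size == (6, 8)

simcc_bins = tuple(int(s * simcc_split_ratio) for s in input_size)
assert simcc_bins == (384, 512)  # x- and y-axis classification bins
```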
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..7831708
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-x_8xb32-270e_coco-wholebody-384x288.py
@@ -0,0 +1,233 @@
+_base_ = ['mmpose::_base_/default_runtime.py']
+
+# common settings
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 32
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
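Annotation (not part of the patch): the `param_scheduler` used by every config here is a 1000-iteration linear warmup followed by cosine annealing over the second half of training. A small sketch reconstructing the epoch-level LR curve; it ignores the warmup iterations and is illustrative only:

```python
# Epoch-level LR under the scheduler above: flat base_lr until epoch 135,
# then cosine annealing down to base_lr * 0.05 at epoch 270.
import math

base_lr = 4e-3
max_epochs = 270
eta_min = base_lr * 0.05

def lr_at_epoch(epoch: int) -> float:
    """LR after the 1000-iteration warmup has finished."""
    begin = max_epochs // 2              # cosine begins at epoch 135
    if epoch < begin:
        return base_lr
    t = (epoch - begin) / (max_epochs - begin)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

assert abs(lr_at_epoch(0) - 4e-3) < 1e-9
assert abs(lr_at_epoch(270) - 2e-4) < 1e-9  # base_lr * 0.05
```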
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md
new file mode 100644
index 0000000..93c8434
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md
@@ -0,0 +1,62 @@
+RTMPose (arXiv'2023)
+
+```bibtex
+@misc{jiang2023rtmpose,
+  doi = {10.48550/ARXIV.2303.07399},
+  url = {https://arxiv.org/abs/2303.07399},
+  author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
+  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
+ title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
+ publisher = {arXiv},
+ year = {2023},
+ copyright = {Creative Commons Attribution 4.0 International}
+}
+```
+
+RTMDet (arXiv'2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on COCO val2017.
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [rtmpose-m](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.673 | 0.750 | 0.615 | 0.752 | 0.813 | 0.871 | 0.475 | 0.589 | 0.582 | 0.674 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.json) |
+| [rtmpose-l](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.695 | 0.769 | 0.658 | 0.785 | 0.833 | 0.887 | 0.519 | 0.628 | 0.611 | 0.700 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.json) |
+| [rtmpose-l](/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 0.712 | 0.781 | 0.693 | 0.811 | 0.882 | 0.919 | 0.579 | 0.677 | 0.648 | 0.730 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.json) |
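Annotation (not part of the patch): any of these configs can be loaded with the checkpoint linked in the table through mmpose's standard top-down API. A hedged sketch; the config path assumes this repository's `modules/rtmpose` layout and `demo.jpg` is a placeholder image:

```python
from mmpose.apis import inference_topdown, init_model

# Config from this patch; checkpoint URL taken from the table above.
config = ('modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/'
          'rtmpose-m_8xb64-270e_coco-wholebody-256x192.py')
checkpoint = ('https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
              'rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth')

model = init_model(config, checkpoint, device='cpu')
# Top-down inference; with no bboxes given, the whole image is used as one box.
results = inference_topdown(model, 'demo.jpg')
keypoints = results[0].pred_instances.keypoints  # (num_instances, 133, 2)
print(keypoints.shape)
```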
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml
new file mode 100644
index 0000000..e2f6b51
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.yml
@@ -0,0 +1,66 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py
+ In Collection: RTMPose
+ Alias: wholebody
+ Metadata:
+ Architecture: &id001
+ - RTMPose
+ Training Data: COCO-WholeBody
+ Name: rtmpose-m_8xb64-270e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.673
+ Body AR: 0.750
+ Face AP: 0.813
+ Face AR: 0.871
+ Foot AP: 0.615
+ Foot AR: 0.752
+ Hand AP: 0.475
+ Hand AR: 0.589
+ Whole AP: 0.582
+ Whole AR: 0.674
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmpose-l_8xb64-270e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.695
+ Body AR: 0.769
+ Face AP: 0.833
+ Face AR: 0.887
+ Foot AP: 0.658
+ Foot AR: 0.785
+ Hand AP: 0.519
+ Hand AR: 0.628
+ Whole AP: 0.611
+ Whole AR: 0.700
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth
+- Config: configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py
+ In Collection: RTMPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: rtmpose-l_8xb32-270e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.712
+ Body AR: 0.781
+ Face AP: 0.882
+ Face AR: 0.919
+ Foot AP: 0.693
+ Foot AR: 0.811
+ Hand AP: 0.579
+ Hand AR: 0.677
+ Whole AP: 0.648
+ Whole AR: 0.730
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth
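Annotation (not part of the patch): the `Alias: wholebody` field in this model-index file is what lets the m-variant be requested by shorthand through `MMPoseInferencer`. A minimal usage sketch (`demo.jpg` is a placeholder):

```python
from mmpose.apis import MMPoseInferencer

# 'wholebody' resolves to rtmpose-m_8xb64-270e_coco-wholebody-256x192 through
# the Alias field in the model-index yml above.
inferencer = MMPoseInferencer(pose2d='wholebody')
result = next(inferencer('demo.jpg'))
print(result.keys())  # 'visualization' and 'predictions', per the inferencer API
```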
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
new file mode 100644
index 0000000..e8b4bfb
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py
@@ -0,0 +1,256 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 32
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 135 to 270 (max_epochs // 2 to max_epochs)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(288, 384),
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=(9, 12),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
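Annotation (not part of the patch): the `CombinedDataset` above mixes COCO-WholeBody with one `UBody2dDataset` per scene, subsampling UBody frames with `sample_interval=10`. A quick count of the sub-datasets being combined:

```python
# Number of sub-datasets inside the CombinedDataset built above.
scenes = [
    'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
    'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
    'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
]
num_sub_datasets = 1 + len(scenes)   # COCO-WholeBody + 15 UBody scenes
assert num_sub_datasets == 16
# sample_interval=10 keeps every 10th UBody frame, thinning the video data.
```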
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py
new file mode 100644
index 0000000..fb7f783
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py
@@ -0,0 +1,256 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 64
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 135 to 270 (max_epochs // 2 to max_epochs)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1024,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
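Annotation (not part of the patch): all of these configs set `auto_scale_lr` with `base_batch_size=512`. Under mmengine's linear scaling rule (applied only when training is launched with the auto-scale-LR option), the `8xb64` layouts need no rescaling; a worked check:

```python
# Linear scaling rule: lr = base_lr * actual_batch_size / base_batch_size.
base_lr = 4e-3
base_batch_size = 512            # from auto_scale_lr above
num_gpus, per_gpu_batch = 8, 64  # the "8xb64" in this config's name

actual_batch_size = num_gpus * per_gpu_batch
scaled_lr = base_lr * actual_batch_size / base_batch_size
assert scaled_lr == base_lr      # 8 x 64 = 512, so no rescaling happens
```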
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py
new file mode 100644
index 0000000..6fe5fc4
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-m_8xb64-270e_coco-ubody-wholebody-256x192.py
@@ -0,0 +1,256 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 64
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 135 to 270 (max_epochs // 2 to max_epochs)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=768,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
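Annotation (not part of the patch): every head in this patch trains with `KLDiscretLoss`, which treats each SimCC axis as a classification over bins and matches the predicted distribution against a temperature-softened label distribution. A minimal sketch of that computation; it mirrors mmpose's implementation only loosely, and the random tensors stand in for real predictions and encoded labels:

```python
# Rough shape of the SimCC KL loss: beta-scaled log-softmax of predictions
# against a beta-scaled softmax of the encoded labels (label_softmax=True).
import torch
import torch.nn.functional as F

beta = 10.0                        # temperature from the config
pred = torch.randn(2, 133, 384)    # (batch, keypoints, x-axis bins) for 192 * 2
target = torch.randn(2, 133, 384)  # encoded SimCC labels (illustrative values)

log_p = F.log_softmax(pred * beta, dim=-1)
q = F.softmax(target * beta, dim=-1)
loss = F.kl_div(log_p, q, reduction='none').sum(-1).mean()
print(float(loss))
```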
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py
new file mode 100644
index 0000000..1620793
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-s_8xb64-270e_coco-ubody-wholebody-256x192.py
@@ -0,0 +1,256 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 64
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 135 to 270 (max_epochs // 2 to max_epochs)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=512,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
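Annotation (not part of the patch): the ubody variants above (and the t-variant that follows) differ only in the CSPNeXt scaling factors, and each head's `in_channels` is the backbone's 1024-channel last stage scaled by `widen_factor`. A check against the values in these configs:

```python
# Backbone scaling vs. head in_channels across the t/s/m/l ubody configs.
variants = {
    # name: (deepen_factor, widen_factor, head in_channels from the configs)
    't': (0.167, 0.375, 384),
    's': (0.33, 0.5, 512),
    'm': (0.67, 0.75, 768),
    'l': (1.0, 1.0, 1024),
}
for name, (deepen, widen, in_channels) in variants.items():
    assert in_channels == int(1024 * widen), name
# The x-variant follows the same rule: 1280 = 1024 * 1.25.
```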
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py
new file mode 100644
index 0000000..fb90798
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-t_8xb64-270e_coco-ubody-wholebody-256x192.py
@@ -0,0 +1,256 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 64
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+        # use cosine lr from epoch 135 to 270 (max_epochs // 2 to max_epochs)
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scale the LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=(192, 256),
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.167,
+ widen_factor=0.375,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=384,
+ out_channels=133,
+ input_size=codec['input_size'],
+ in_featuremap_size=(6, 8),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
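Annotation (not part of the patch): the `EMAHook` in these configs keeps an exponential moving average of the model weights. `ExpMomentumEMA` warms the momentum up over the first updates, but asymptotically each step blends a small fraction of the live weights into the average. A scalar sketch of that steady-state behaviour (illustrative only):

```python
# Steady-state EMA update: w_ema = (1 - m) * w_ema + m * w, with m = 0.0002.
momentum = 0.0002

w_ema, w = 0.0, 1.0
for _ in range(10_000):
    w_ema = (1 - momentum) * w_ema + momentum * w
print(round(w_ema, 3))  # ~0.865 after 10k steps: slow, stable averaging
```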
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py
new file mode 100644
index 0000000..71330c1
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb32-270e_coco-ubody-wholebody-384x288.py
@@ -0,0 +1,260 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (288, 384)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 32
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
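+# Net effect: linear warm-up over the first 1000 iterations, a constant lr of
+# `base_lr` until epoch 135 (max_epochs // 2), then cosine decay to
+# base_lr * 0.05.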
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
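+# When LR auto-scaling is enabled at launch, mmengine rescales the rate
+# linearly: lr = base_lr * actual_batch_size / 512 (8 GPUs x 32 per GPU = 256
+# for this config, i.e. half of base_lr).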
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+ input_size=input_size,
+ sigma=(6., 6.93),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
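+# SimCC casts keypoint localization as per-axis 1-D classification; with
+# simcc_split_ratio=2.0 each input pixel maps to two bins, i.e. 288 * 2 = 576
+# x-bins and 384 * 2 = 768 y-bins for this input size.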
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
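+# UBody scenes are video-derived; sample_interval=10 keeps every 10th
+# annotated frame per scene to thin out near-duplicate frames.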
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py
new file mode 100644
index 0000000..44d13e2
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_8xb64-270e_coco-ubody-wholebody-256x192.py
@@ -0,0 +1,260 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# common setting
+num_keypoints = 133
+input_size = (192, 256)
+
+# runtime
+max_epochs = 270
+stage2_num_epochs = 30
+base_lr = 4e-3
+train_batch_size = 64
+val_batch_size = 32
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ clip_grad=dict(max_norm=35, norm_type=2),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='SimCCLabel',
+    input_size=input_size,
+ sigma=(4.9, 5.66),
+ simcc_split_ratio=2.0,
+ normalize=False,
+ use_dark=False)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.33,
+ widen_factor=1.25,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
+ 'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa: E501
+ )),
+ head=dict(
+ type='RTMCCHead',
+ in_channels=1280,
+ out_channels=num_keypoints,
+ input_size=codec['input_size'],
+ in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
+ simcc_split_ratio=codec['simcc_split_ratio'],
+ final_layer_kernel_size=7,
+ gau_cfg=dict(
+ hidden_dims=256,
+ s=128,
+ expansion_factor=2,
+ dropout_rate=0.,
+ drop_path=0.,
+ act_fn='SiLU',
+ use_rel_bias=False,
+ pos_enc=False),
+ loss=dict(
+ type='KLDiscretLoss',
+ use_target_weight=True,
+ beta=10.,
+ label_softmax=True),
+ decoder=codec),
+ test_cfg=dict(flip_test=True, ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+backend_args = dict(backend='local')
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.5, 1.5],
+ rotate_factor=90),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=train_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+
+val_dataloader = dict(
+ batch_size=val_batch_size,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ data_prefix=dict(img='coco/val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md
new file mode 100644
index 0000000..c60db05
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/README.md
@@ -0,0 +1,35 @@
+# Top-down heatmap-based pose estimation
+
+Top-down methods split the task into two stages: object detection, followed by single-object pose estimation within each detected bounding box. Instead of regressing keypoint coordinates directly, the pose estimator produces one heatmap per keypoint, representing the likelihood of each location being that keypoint, following the paradigm introduced in [Simple Baselines for Human Pose Estimation and Tracking](http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html).
+
+
+
+
+
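+Below is a minimal, illustrative decoding sketch (a toy under stated
+assumptions, not the library implementation): the head predicts one heatmap
+per keypoint, and the coordinate is recovered from the argmax, mapped back to
+the network input resolution.
+
+```python
+import numpy as np
+
+
+def decode_heatmaps(heatmaps, input_size):
+    """heatmaps: (K, H, W) array; input_size: (w, h). Returns xy coords and scores."""
+    num_kpts, hm_h, hm_w = heatmaps.shape
+    flat = heatmaps.reshape(num_kpts, -1)
+    ys, xs = np.unravel_index(flat.argmax(axis=1), (hm_h, hm_w))
+    scores = flat.max(axis=1)
+    # map heatmap-grid coordinates back to the network input resolution
+    coords = np.stack([xs * input_size[0] / hm_w,
+                       ys * input_size[1] / hm_h], axis=1)
+    return coords.astype(np.float32), scores
+```
+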
+## Results and Models
+
+### COCO-WholeBody Dataset
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on the COCO val2017 dataset.
+
+| Model | Input Size | Whole AP | Whole AR | Details and Download |
+| :-----------------: | :--------: | :------: | :------: | :-----------------------------------------------------------------------------: |
+| HRNet-w48+Dark+ | 384x288 | 0.661 | 0.743 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) |
+| HRNet-w32+Dark | 256x192 | 0.582 | 0.671 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) |
+| HRNet-w48 | 256x192 | 0.579 | 0.681 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) |
+| CSPNeXt-m | 256x192 | 0.567 | 0.641 | [cspnext_udp_coco-wholebody.md](./coco-wholebody/cspnext_udp_coco-wholebody.md) |
+| HRNet-w32 | 256x192 | 0.549 | 0.646 | [hrnet_ubody-coco-wholebody.md](./ubody2d/hrnet_ubody-coco-wholebody.md) |
+| ResNet-152 | 256x192 | 0.548 | 0.661 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) |
+| HRNet-w32 | 256x192 | 0.536 | 0.636 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) |
+| ResNet-101 | 256x192 | 0.531 | 0.645 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) |
+| S-ViPNAS-Res50+Dark | 256x192 | 0.528 | 0.632 | [vipnas_dark_coco-wholebody.md](./coco-wholebody/vipnas_dark_coco-wholebody.md) |
+| ResNet-50 | 256x192 | 0.521 | 0.633 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) |
+| S-ViPNAS-Res50 | 256x192 | 0.495 | 0.607 | [vipnas_coco-wholebody.md](./coco-wholebody/vipnas_coco-wholebody.md) |
+
+### UBody2D Dataset
+
+Results on the UBody val set, computed with ground-truth keypoints.
+
+| Model | Input Size | Whole AP | Whole AR | Details and Download |
+| :-------: | :--------: | :------: | :------: | :----------------------------------------------------------------------: |
+| HRNet-w32 | 256x192 | 0.690 | 0.729 | [hrnet_ubody-coco-wholebody.md](./ubody2d/hrnet_ubody-coco-wholebody.md) |
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..aa98e5a
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,212 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
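+# UDP ("unbiased data processing") decodes heatmaps at 1/4 input resolution:
+# (192 / 4, 256 / 4) = (48, 64)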
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=1.,
+ widen_factor=1.,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=1024,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..d6d1c2f
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,212 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+max_epochs = 210
+stage2_num_epochs = 30
+base_lr = 4e-3
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=10)
+randomness = dict(seed=21)
+
+# optimizer
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+ paramwise_cfg=dict(
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1.0e-5,
+ by_epoch=False,
+ begin=0,
+ end=1000),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=base_lr * 0.05,
+ begin=max_epochs // 2,
+ end=max_epochs,
+ T_max=max_epochs // 2,
+ by_epoch=True,
+ convert_to_iter_based=True),
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# codec settings
+codec = dict(
+ type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ _scope_='mmdet',
+ type='CSPNeXt',
+ arch='P5',
+ expand_ratio=0.5,
+ deepen_factor=0.67,
+ widen_factor=0.75,
+ out_indices=(4, ),
+ channel_attention=True,
+ norm_cfg=dict(type='SyncBN'),
+ act_cfg=dict(type='SiLU'),
+ init_cfg=dict(
+ type='Pretrained',
+ prefix='backbone.',
+ checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+ 'rtmdet/cspnext_rsb_pretrain/'
+ 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=768,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=False,
+ flip_mode='heatmap',
+ shift_heatmap=False,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+backend_args = dict(backend='local')
+# backend_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
+# }))
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=1.0),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImage', backend_args=backend_args),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ shift_factor=0.,
+ scale_factor=[0.75, 1.25],
+ rotate_factor=60),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='mmdet.YOLOXHSVRandomAug'),
+ dict(
+ type='Albumentation',
+ transforms=[
+ dict(type='Blur', p=0.1),
+ dict(type='MedianBlur', p=0.1),
+ dict(
+ type='CoarseDropout',
+ max_holes=1,
+ max_height=0.4,
+ max_width=0.4,
+ min_holes=1,
+ min_height=0.2,
+ min_width=0.2,
+ p=0.5),
+ ]),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=10,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=10,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - stage2_num_epochs,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+# evaluators
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md
new file mode 100644
index 0000000..7f8e000
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md
@@ -0,0 +1,56 @@
+
+
+
+RTMDet (ArXiv 2022)
+
+```bibtex
+@misc{lyu2022rtmdet,
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+ year={2022},
+ eprint={2212.07784},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+
+
+
+
+
+UDP (CVPR'2020)
+
+```bibtex
+@InProceedings{Huang_2020_CVPR,
+ author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan},
+ title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2020}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on the COCO val2017 dataset.
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [pose_cspnext_m_udp](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.687 | 0.735 | 0.680 | 0.763 | 0.697 | 0.755 | 0.460 | 0.543 | 0.567 | 0.641 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.json) |
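+
+A minimal usage sketch for the checkpoint above ('demo.jpg' is a placeholder
+input image; any config/checkpoint pair from these tables works the same way):
+
+```python
+from mmpose.apis import inference_topdown, init_model
+
+config = ('configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/'
+          'cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py')
+ckpt = ('https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/'
+        'cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth')
+
+model = init_model(config, ckpt, device='cuda:0')
+# With no bboxes passed, the whole image is treated as a single instance.
+results = inference_topdown(model, 'demo.jpg')
+keypoints = results[0].pred_instances.keypoints  # (num_instances, 133, 2)
+```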
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml
new file mode 100644
index 0000000..bdcb4c5
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.yml
@@ -0,0 +1,24 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py
+ In Collection: UDP
+ Metadata:
+ Architecture: &id001
+ - UDP
+ - CSPNeXt
+ Training Data: COCO-WholeBody
+ Name: cspnext-m_udp_8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.687
+ Body AR: 0.735
+ Face AP: 0.697
+ Face AR: 0.755
+ Foot AP: 0.680
+ Foot AR: 0.763
+ Hand AP: 0.46
+      Hand AR: 0.543
+ Whole AP: 0.567
+ Whole AR: 0.641
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md
new file mode 100644
index 0000000..8dd01d5
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.md
@@ -0,0 +1,41 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on the COCO val2017 dataset.
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.678 | 0.755 | 0.543 | 0.661 | 0.630 | 0.708 | 0.467 | 0.566 | 0.536 | 0.636 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192-853765cd_20200918.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_20200918.log.json) |
+| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py) | 384x288 | 0.700 | 0.772 | 0.585 | 0.691 | 0.726 | 0.783 | 0.515 | 0.603 | 0.586 | 0.673 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288-78cacac3_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288_20200922.log.json) |
+| [pose_hrnet_w48](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.701 | 0.776 | 0.675 | 0.787 | 0.656 | 0.743 | 0.535 | 0.639 | 0.579 | 0.681 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192_20200922.log.json) |
+| [pose_hrnet_w48](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.722 | 0.791 | 0.696 | 0.801 | 0.776 | 0.834 | 0.587 | 0.678 | 0.632 | 0.717 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288-6e061c6a_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_20200922.log.json) |
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml
new file mode 100644
index 0000000..2cee2ac
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml
@@ -0,0 +1,86 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.678
+ Body AR: 0.755
+ Face AP: 0.630
+ Face AR: 0.708
+ Foot AP: 0.543
+ Foot AR: 0.661
+ Hand AP: 0.467
+ Hand AR: 0.566
+ Whole AP: 0.536
+ Whole AR: 0.636
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192-853765cd_20200918.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.700
+ Body AR: 0.772
+ Face AP: 0.726
+ Face AR: 0.783
+ Foot AP: 0.585
+ Foot AR: 0.691
+ Hand AP: 0.515
+ Hand AR: 0.603
+ Whole AP: 0.586
+ Whole AR: 0.673
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_384x288-78cacac3_20200922.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.701
+ Body AR: 0.776
+ Face AP: 0.656
+ Face AR: 0.743
+ Foot AP: 0.675
+ Foot AR: 0.787
+ Hand AP: 0.535
+ Hand AR: 0.639
+ Whole AP: 0.579
+ Whole AR: 0.681
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_256x192-643e18cb_20200922.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.722
+ Body AR: 0.791
+ Face AP: 0.776
+ Face AR: 0.834
+ Foot AP: 0.696
+ Foot AR: 0.801
+ Hand AP: 0.587
+ Hand AR: 0.678
+ Whole AP: 0.632
+ Whole AR: 0.717
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288-6e061c6a_20200922.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md
new file mode 100644
index 0000000..fa4bc27
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.md
@@ -0,0 +1,58 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on the COCO val2017 dataset.
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [pose_hrnet_w32_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.693 | 0.764 | 0.564 | 0.674 | 0.737 | 0.809 | 0.503 | 0.602 | 0.582 | 0.671 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark-469327ef_20200922.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark_20200922.log.json) |
+| [pose_hrnet_w48_dark+](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.742 | 0.807 | 0.707 | 0.806 | 0.841 | 0.892 | 0.602 | 0.694 | 0.661 | 0.743 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark_20200918.log.json) |
+
+Note: `+` means the model was first pre-trained on the original COCO dataset and then fine-tuned on the COCO-WholeBody dataset; we find this leads to better performance.
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml
new file mode 100644
index 0000000..25a22cc
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml
@@ -0,0 +1,45 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ - DarkPose
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.693
+ Body AR: 0.764
+ Face AP: 0.737
+ Face AR: 0.809
+ Foot AP: 0.564
+ Foot AR: 0.674
+ Hand AP: 0.503
+ Hand AR: 0.602
+ Whole AP: 0.582
+ Whole AR: 0.671
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_wholebody_256x192_dark-469327ef_20200922.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py
+ In Collection: DarkPose
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.742
+ Body AR: 0.807
+ Face AP: 0.841
+ Face AR: 0.892
+ Foot AP: 0.707
+ Foot AR: 0.806
+ Hand AP: 0.602
+ Hand AR: 0.694
+ Whole AP: 0.661
+ Whole AR: 0.743
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md
new file mode 100644
index 0000000..187e5d3
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.md
@@ -0,0 +1,43 @@
+
+
+
+SimpleBaseline2D (ECCV'2018)
+
+```bibtex
+@inproceedings{xiao2018simple,
+ title={Simple baselines for human pose estimation and tracking},
+ author={Xiao, Bin and Wu, Haiping and Wei, Yichen},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={466--481},
+ year={2018}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a person detector with 56.4 human AP on the COCO val2017 dataset.
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [pose_resnet_50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.652 | 0.738 | 0.615 | 0.749 | 0.606 | 0.715 | 0.460 | 0.584 | 0.521 | 0.633 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192-9e37ed88_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192_20201004.log.json) |
+| [pose_resnet_50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py) | 384x288 | 0.666 | 0.747 | 0.634 | 0.763 | 0.731 | 0.811 | 0.536 | 0.646 | 0.574 | 0.670 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288-ce11e294_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288_20201004.log.json) |
+| [pose_resnet_101](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.669 | 0.753 | 0.637 | 0.766 | 0.611 | 0.722 | 0.463 | 0.589 | 0.531 | 0.645 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192-7325f982_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192_20201004.log.json) |
+| [pose_resnet_101](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.692 | 0.770 | 0.680 | 0.799 | 0.746 | 0.820 | 0.548 | 0.657 | 0.597 | 0.693 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288-6c137b9a_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288_20201004.log.json) |
+| [pose_resnet_152](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py) | 256x192 | 0.682 | 0.764 | 0.661 | 0.787 | 0.623 | 0.728 | 0.481 | 0.607 | 0.548 | 0.661 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192-5de8ae23_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192_20201004.log.json) |
+| [pose_resnet_152](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py) | 384x288 | 0.704 | 0.780 | 0.693 | 0.813 | 0.751 | 0.824 | 0.559 | 0.666 | 0.610 | 0.705 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288-eab8caa8_20201004.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288_20201004.log.json) |
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml
new file mode 100644
index 0000000..c4c148a
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml
@@ -0,0 +1,128 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: &id001
+ - SimpleBaseline2D
+ Training Data: COCO-WholeBody
+ Name: td-hm_res50_8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.652
+ Body AR: 0.738
+ Face AP: 0.606
+ Face AR: 0.715
+ Foot AP: 0.615
+ Foot AR: 0.749
+ Hand AP: 0.46
+ Hand AR: 0.584
+ Whole AP: 0.521
+ Whole AR: 0.633
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_256x192-9e37ed88_20201004.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_res50_8xb64-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.666
+ Body AR: 0.747
+ Face AP: 0.731
+ Face AR: 0.811
+ Foot AP: 0.634
+ Foot AR: 0.763
+ Hand AP: 0.536
+ Hand AR: 0.646
+ Whole AP: 0.574
+ Whole AR: 0.67
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_wholebody_384x288-ce11e294_20201004.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_res101_8xb32-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.669
+ Body AR: 0.753
+ Face AP: 0.611
+ Face AR: 0.722
+ Foot AP: 0.637
+ Foot AR: 0.766
+ Hand AP: 0.463
+ Hand AR: 0.589
+ Whole AP: 0.531
+ Whole AR: 0.645
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_256x192-7325f982_20201004.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_res101_8xb32-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.692
+ Body AR: 0.77
+ Face AP: 0.746
+ Face AR: 0.82
+ Foot AP: 0.68
+ Foot AR: 0.799
+ Hand AP: 0.548
+ Hand AR: 0.657
+      Whole AP: 0.597
+      Whole AR: 0.693
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_wholebody_384x288-6c137b9a_20201004.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_res152_8xb32-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.682
+ Body AR: 0.764
+ Face AP: 0.623
+ Face AR: 0.728
+ Foot AP: 0.661
+ Foot AR: 0.787
+ Hand AP: 0.481
+ Hand AR: 0.607
+ Whole AP: 0.548
+ Whole AR: 0.661
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_256x192-5de8ae23_20201004.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py
+ In Collection: SimpleBaseline2D
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_res152_8xb32-210e_coco-wholebody-384x288
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.704
+ Body AR: 0.78
+ Face AP: 0.751
+ Face AR: 0.824
+ Foot AP: 0.693
+ Foot AR: 0.813
+ Hand AP: 0.559
+ Hand AR: 0.666
+ Whole AP: 0.61
+ Whole AR: 0.705
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_wholebody_384x288-eab8caa8_20201004.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..339581e
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=133,
+ deconv_out_channels=None,
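+        # HRNet already outputs 1/4-resolution features, so no deconv
+        # upsampling layers are needed (unlike ResNet-style backbones)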
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..677fdc7
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
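Across these configs the heatmap grid is always a quarter of the input resolution, and sigma is bumped from 2 to 3 at the larger input. A small helper makes the convention explicit (derived from the values written in the configs above, not from mmpose internals):

```python
def msra_heatmap_size(input_size):
    """Heatmap grid used by the MSRAHeatmap codecs in these configs: 1/4 input."""
    w, h = input_size
    return (w // 4, h // 4)

assert msra_heatmap_size((192, 256)) == (48, 64)   # sigma=2 in these configs
assert msra_heatmap_size((288, 384)) == (72, 96)   # sigma=3 in these configs
```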
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..3717831
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
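The `dark` variants differ from their plain counterparts only in the codec: `unbiased=True` switches MSRAHeatmap to the distribution-aware (DARK) encoding/decoding of Zhang et al. (cited in the DarkPose bibtex later in this diff). A quick way to confirm the delta, assuming both configs are on disk:

```python
# Sketch: diff the two codecs; everything else in the configs is identical.
from mmengine.config import Config

base = 'configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/'
plain = Config.fromfile(base + 'td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py')
dark = Config.fromfile(base + 'td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py')
print(plain.codec)  # {'type': 'MSRAHeatmap', ..., 'sigma': 2}
print(dark.codec)   # {'type': 'MSRAHeatmap', ..., 'sigma': 2, 'unbiased': True}
```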
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..5c53b7c
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
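The only substantive differences between the w32 and w48 files are the HRNet branch widths and the matching head `in_channels`: the head consumes the highest-resolution branch, so its input width must equal the first channel count of stage4. In sketch form, with values taken from the configs above:

```python
def hrnet_stage_channels(width):
    """Branch widths for HRNet-W{width}, as written in these configs."""
    return {
        'stage2': (width, width * 2),
        'stage3': (width, width * 2, width * 4),
        'stage4': (width, width * 2, width * 4, width * 8),
    }

assert hrnet_stage_channels(32)['stage4'] == (32, 64, 128, 256)   # head in_channels=32
assert hrnet_stage_channels(48)['stage4'] == (48, 96, 192, 384)   # head in_channels=48
```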
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..ef25d2a
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py
@@ -0,0 +1,150 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..d77872c
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py
@@ -0,0 +1,154 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(288, 384),
+ heatmap_size=(72, 96),
+ sigma=3,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w48-8ef0771d.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=48,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..87c273c
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
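Unlike the HRNet configs, the ResNet ones do not set `deconv_out_channels=None`: ResNet ends at stride 32 with 2048 channels, so the head needs its deconv stack to upsample back to the stride-4 heatmap grid. The sketch below assumes the mmpose 1.x default of three 256-channel 4x4 deconv layers; treat the printed structure as illustrative rather than authoritative.

```python
# Sketch (assumes mmpose 1.x defaults for HeatmapHead's deconv stack).
from mmpose.models.heads import HeatmapHead

head = HeatmapHead(
    in_channels=2048,   # ResNet stage-4 output, as in the config above
    out_channels=133,   # COCO-WholeBody keypoints
    loss=dict(type='KeypointMSELoss', use_target_weight=True))
print(head.deconv_layers)  # expected: three ConvTranspose2d(..., 256, k=4, s=2) blocks
```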
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..5e58a16
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=256)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..3ce4936
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..a92c4d2
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=152,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet152'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..127c322
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
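The `8xb64` token in the file name encodes the reference setup: 8 GPUs at 64 samples each, matching `auto_scale_lr`'s `base_batch_size=512`. Under MMEngine's linear scaling rule (enabled via the trainer's `--auto-scale-lr` flag), the configured LR is rescaled by the ratio of the actual to the reference effective batch; a sketch of the arithmetic:

```python
def scaled_lr(base_lr, gpus, batch_per_gpu, base_batch_size):
    """Linear LR scaling: lr * (actual effective batch / reference batch)."""
    return base_lr * (gpus * batch_per_gpu) / base_batch_size

# Training this res50 config on 4 GPUs instead of 8 would halve the LR:
print(scaled_lr(5e-4, gpus=4, batch_per_gpu=64, base_batch_size=512))  # 0.00025
```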
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py
new file mode 100644
index 0000000..88a88e2
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py
@@ -0,0 +1,121 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(288, 384), heatmap_size=(72, 96), sigma=3)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=2048,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
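At evaluation time none of these top-down configs use ground-truth person boxes: `bbox_file` points every val loader at precomputed detections, which is why the results tables later in this diff are qualified with the detector's 56.4 human AP. The file follows the standard COCO detection-result format; a quick inspection sketch, using the path from the configs:

```python
import json

# Sketch: peek at the detector boxes used for top-down evaluation.
path = ('data/coco/person_detection_results/'
        'COCO_val2017_detections_AP_H_56_person.json')
with open(path) as f:
    dets = json.load(f)
print(len(dets))
print(dets[0])  # e.g. {'image_id': ..., 'category_id': 1, 'bbox': [x, y, w, h], 'score': ...}
```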
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..b39adf9
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,122 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ViPNAS_MobileNetV3'),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=160,
+ out_channels=133,
+ deconv_out_channels=(160, 160, 160),
+ deconv_num_groups=(160, 160, 160),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
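`deconv_num_groups=(160, 160, 160)` makes each of the ViPNAS head's deconv layers fully grouped (groups equal to channels, i.e. depthwise), which is where most of the head's parameter savings come from. A standalone sketch of the effect; the 4x4 kernel here is illustrative, not read from the mmpose implementation:

```python
import torch.nn as nn

# Depthwise (fully grouped) transposed conv vs. a dense one at the same width.
dense = nn.ConvTranspose2d(160, 160, kernel_size=4, stride=2, padding=1)
depthwise = nn.ConvTranspose2d(160, 160, kernel_size=4, stride=2, padding=1, groups=160)
n_params = lambda m: sum(p.numel() for p in m.parameters())
print(n_params(dense), n_params(depthwise))  # 409760 vs 2720
```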
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..851c04a
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,126 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(type='ViPNAS_MobileNetV3'),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=160,
+ out_channels=133,
+ deconv_out_channels=(160, 160, 160),
+ deconv_num_groups=(160, 160, 160),
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..24c7578
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,123 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ViPNAS_ResNet',
+ depth=50,
+ ),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=608,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
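Note the head's `in_channels=608` here: the NAS-searched ResNet-50 ends with 608 channels rather than the standard 2048 seen in the plain ResNet configs above. A sketch to confirm the backbone/head contract, assuming mmpose 1.x is installed:

```python
import torch
from mmpose.models.backbones import ViPNAS_ResNet

backbone = ViPNAS_ResNet(depth=50)
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 256, 192))  # NCHW, the config's input size
print(feats[-1].shape)  # expected: torch.Size([1, 608, 8, 6]) -> head in_channels=608
```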
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py
new file mode 100644
index 0000000..585e3dc
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py
@@ -0,0 +1,127 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap',
+ input_size=(192, 256),
+ heatmap_size=(48, 64),
+ sigma=2,
+ unbiased=True)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='ViPNAS_ResNet',
+ depth=50,
+ ),
+ head=dict(
+ type='ViPNASHead',
+ in_channels=608,
+ out_channels=133,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'CocoWholeBodyDataset'
+data_mode = 'topdown'
+data_root = 'data/coco/'
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(
+ type='RandomBBoxTransform',
+ rotate_factor=60,
+ scale_factor=(0.75, 1.25)),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=train_pipeline,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ pipeline=val_pipeline,
+ ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md
new file mode 100644
index 0000000..13b0321
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.md
@@ -0,0 +1,38 @@
+
+
+
+ViPNAS (CVPR'2021)
+
+```bibtex
+@inproceedings{xu2021vipnas,
+ title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
+ author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year={2021}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, obtained with a person detector scoring 56.4 human AP on the COCO val2017 dataset
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [S-ViPNAS-MobileNetV3](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.619 | 0.700 | 0.477 | 0.608 | 0.585 | 0.689 | 0.386 | 0.505 | 0.473 | 0.578 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192-0fee581a_20211205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_20211205.log.json) |
+| [S-ViPNAS-Res50](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.643 | 0.726 | 0.553 | 0.694 | 0.587 | 0.698 | 0.410 | 0.529 | 0.495 | 0.607 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192-49e1c3a4_20211112.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_20211112.log.json) |
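A minimal inference sketch wiring the S-ViPNAS-Res50 row above to its config and checkpoint. `demo.jpg` is a placeholder image; with no boxes passed, `inference_topdown` treats the whole frame as one person bbox (mmpose 1.x API assumed):

```python
import torch
from mmpose.apis import inference_topdown, init_model

cfg = ('configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/'
       'td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py')
ckpt = ('https://download.openmmlab.com/mmpose/top_down/vipnas/'
        'vipnas_res50_wholebody_256x192-49e1c3a4_20211112.pth')
model = init_model(cfg, ckpt, device='cuda:0' if torch.cuda.is_available() else 'cpu')
results = inference_topdown(model, 'demo.jpg')
print(results[0].pred_instances.keypoints.shape)  # (1, 133, 2) for COCO-WholeBody
```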
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml
new file mode 100644
index 0000000..cae2a9a
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml
@@ -0,0 +1,44 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: &id001
+ - ViPNAS
+ Training Data: COCO-WholeBody
+ Name: td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.619
+ Body AR: 0.7
+ Face AP: 0.585
+ Face AR: 0.689
+ Foot AP: 0.477
+ Foot AR: 0.608
+ Hand AP: 0.386
+ Hand AR: 0.505
+ Whole AP: 0.473
+ Whole AR: 0.578
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192-0fee581a_20211205.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.643
+ Body AR: 0.726
+ Face AP: 0.587
+ Face AR: 0.698
+ Foot AP: 0.553
+ Foot AR: 0.694
+ Hand AP: 0.41
+ Hand AR: 0.529
+ Whole AP: 0.495
+ Whole AR: 0.607
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192-49e1c3a4_20211112.pth
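
> Editorial note: the model-index YAML above is machine-readable; a short sketch (assumes PyYAML) that picks the strongest entry by Whole-body AP.

```python
# Select the best checkpoint from a model-index file by Whole AP.
import yaml

path = ('modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/'
        'coco-wholebody/vipnas_coco-wholebody.yml')
with open(path) as f:
    index = yaml.safe_load(f)

best = max(index['Models'],
           key=lambda m: m['Results'][0]['Metrics']['Whole AP'])
print(best['Name'])     # -> td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192
print(best['Weights'])  # checkpoint URL to download
```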
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md
new file mode 100644
index 0000000..6bc5624
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.md
@@ -0,0 +1,55 @@
+
+
+
+ViPNAS (CVPR'2021)
+
+```bibtex
+@inproceedings{xu2021vipnas,
+ title={ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search},
+ author={Xu, Lumin and Guan, Yingda and Jin, Sheng and Liu, Wentao and Qian, Chen and Luo, Ping and Ouyang, Wanli and Wang, Xiaogang},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year={2021}
+}
+```
+
+
+
+
+
+
+DarkPose (CVPR'2020)
+
+```bibtex
+@inproceedings{zhang2020distribution,
+ title={Distribution-aware coordinate representation for human pose estimation},
+ author={Zhang, Feng and Zhu, Xiatian and Dai, Hanbin and Ye, Mao and Zhu, Ce},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={7093--7102},
+ year={2020}
+}
+```
+
+
+
+
+
+
+COCO-WholeBody (ECCV'2020)
+
+```bibtex
+@inproceedings{jin2020whole,
+ title={Whole-Body Human Pose Estimation in the Wild},
+ author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ year={2020}
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a human detector with 56.4 AP on the COCO val2017 dataset
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [S-ViPNAS-MobileNetV3_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.632 | 0.710 | 0.530 | 0.660 | 0.672 | 0.771 | 0.404 | 0.519 | 0.508 | 0.607 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark-e2158108_20211205.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark_20211205.log.json) |
+| [S-ViPNAS-Res50_dark](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.650 | 0.732 | 0.550 | 0.686 | 0.684 | 0.783 | 0.437 | 0.554 | 0.528 | 0.632 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark_20211112.log.json) |
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml
new file mode 100644
index 0000000..0f10316
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml
@@ -0,0 +1,45 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: &id001
+ - ViPNAS
+ - DarkPose
+ Training Data: COCO-WholeBody
+ Name: td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.632
+ Body AR: 0.71
+ Face AP: 0.672
+ Face AR: 0.771
+ Foot AP: 0.53
+ Foot AR: 0.66
+ Hand AP: 0.404
+ Hand AR: 0.519
+ Whole AP: 0.508
+ Whole AR: 0.607
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_wholebody_256x192_dark-e2158108_20211205.pth
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py
+ In Collection: ViPNAS
+ Metadata:
+ Architecture: *id001
+ Training Data: COCO-WholeBody
+ Name: td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.65
+ Body AR: 0.732
+ Face AP: 0.684
+ Face AR: 0.783
+ Foot AP: 0.55
+ Foot AR: 0.686
+ Hand AP: 0.437
+ Hand AR: 0.554
+ Whole AP: 0.528
+ Whole AR: 0.632
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_wholebody_256x192_dark-67c0ce35_20211112.pth
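
> Editorial note: a back-of-the-envelope comparison of the plain and DarkPose-decoded ViPNAS entries; the numbers are copied from the two model-index files above.

```python
# Whole AP with and without DarkPose decoding, from the two YAML files above.
baseline = {'mbv3': 0.473, 'res50': 0.495}  # vipnas_coco-wholebody.yml
dark     = {'mbv3': 0.508, 'res50': 0.528}  # vipnas_dark_coco-wholebody.yml

for arch in baseline:
    gain = dark[arch] - baseline[arch]
    print(f'S-ViPNAS-{arch}: +{gain:.3f} Whole AP from DarkPose decoding')
# DarkPose only changes how heatmaps are encoded/decoded to coordinates,
# so the gain comes at essentially no extra inference cost.
```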
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml
new file mode 100644
index 0000000..736e0bd
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_coco-wholebody.yml
@@ -0,0 +1,23 @@
+Models:
+- Config: configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py
+ In Collection: HRNet
+ Metadata:
+ Architecture: &id001
+ - HRNet
+ Training Data: UBody-COCO-WholeBody
+ Name: td-hm_hrnet-w32_8xb64-210e_ubody-256x192
+ Results:
+ - Dataset: COCO-WholeBody
+ Metrics:
+ Body AP: 0.678
+ Body AR: 0.755
+ Face AP: 0.630
+ Face AR: 0.708
+ Foot AP: 0.543
+ Foot AR: 0.661
+ Hand AP: 0.467
+ Hand AR: 0.566
+ Whole AP: 0.536
+ Whole AR: 0.636
+ Task: Wholebody 2D Keypoint
+ Weights: https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.pth
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md
new file mode 100644
index 0000000..cb4a4cf
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/hrnet_ubody-coco-wholebody.md
@@ -0,0 +1,38 @@
+
+
+
+HRNet (CVPR'2019)
+
+```bibtex
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+
+
+
+
+
+UBody (CVPR'2023)
+
+```bibtex
+@inproceedings{lin2023one,
+ title={One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer},
+ author={Lin, Jing and Zeng, Ailing and Wang, Haoqian and Zhang, Lei and Li, Yu},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ year={2023},
+}
+```
+
+
+
+Results on the COCO-WholeBody v1.0 val set, using a human detector with 56.4 AP on the COCO val2017 dataset
+
+| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log |
+| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: |
+| [pose_hrnet_w32](/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py) | 256x192 | 0.685 | 0.759 | 0.564 | 0.675 | 0.625 | 0.705 | 0.516 | 0.609 | 0.549 | 0.646 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.pth) | [log](https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/ubody/td-hm_hrnet-w32_8xb64-210e_ubody-coco-256x192-7c227391_20230807.json) |
diff --git a/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py
new file mode 100644
index 0000000..38aa1a9
--- /dev/null
+++ b/modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py
@@ -0,0 +1,173 @@
+_base_ = ['../../../_base_/default_runtime.py']
+
+# runtime
+train_cfg = dict(max_epochs=210, val_interval=10)
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(
+ type='Adam',
+ lr=5e-4,
+))
+
+# learning policy
+param_scheduler = [
+ dict(
+ type='LinearLR', begin=0, end=500, start_factor=0.001,
+ by_epoch=False), # warm-up
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=210,
+ milestones=[170, 200],
+ gamma=0.1,
+ by_epoch=True)
+]
+
+# automatically scaling LR based on the actual training batch size
+auto_scale_lr = dict(base_batch_size=512)
+
+# hooks
+default_hooks = dict(
+ checkpoint=dict(save_best='coco-wholebody/AP', rule='greater'))
+
+# codec settings
+codec = dict(
+ type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2)
+
+# model settings
+model = dict(
+ type='TopdownPoseEstimator',
+ data_preprocessor=dict(
+ type='PoseDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True),
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmpose/'
+ 'pretrain_models/hrnet_w32-36af842e.pth'),
+ ),
+ head=dict(
+ type='HeatmapHead',
+ in_channels=32,
+ out_channels=133,
+ deconv_out_channels=None,
+ loss=dict(type='KeypointMSELoss', use_target_weight=True),
+ decoder=codec),
+ test_cfg=dict(
+ flip_test=True,
+ flip_mode='heatmap',
+ shift_heatmap=True,
+ ))
+
+# base dataset settings
+dataset_type = 'UBody2dDataset'
+data_mode = 'topdown'
+data_root = 'data/UBody/'
+
+scenes = [
+ 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow',
+ 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing',
+ 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'
+]
+
+train_datasets = [
+ dict(
+ type='CocoWholeBodyDataset',
+ data_root='data/coco/',
+ data_mode=data_mode,
+ ann_file='annotations/coco_wholebody_train_v1.0.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[])
+]
+
+for scene in scenes:
+ train_dataset = dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_mode=data_mode,
+ ann_file=f'annotations/{scene}/train_annotations.json',
+ data_prefix=dict(img='images/'),
+ pipeline=[],
+ sample_interval=10)
+ train_datasets.append(train_dataset)
+
+# pipelines
+train_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='RandomFlip', direction='horizontal'),
+ dict(type='RandomHalfBody'),
+ dict(type='RandomBBoxTransform'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='GenerateTarget', encoder=codec),
+ dict(type='PackPoseInputs')
+]
+val_pipeline = [
+ dict(type='LoadImage'),
+ dict(type='GetBBoxCenterScale'),
+ dict(type='TopdownAffine', input_size=codec['input_size']),
+ dict(type='PackPoseInputs')
+]
+
+# data loaders
+train_dataloader = dict(
+ batch_size=64,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='CombinedDataset',
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
+ datasets=train_datasets,
+ pipeline=train_pipeline,
+ test_mode=False,
+ ))
+val_dataloader = dict(
+ batch_size=32,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+ dataset=dict(
+ type='CocoWholeBodyDataset',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json',
+ data_prefix=dict(img='data/coco/val2017/'),
+ pipeline=val_pipeline,
+ bbox_file='data/coco/person_detection_results/'
+ 'COCO_val2017_detections_AP_H_56_person.json',
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoWholeBodyMetric',
+ ann_file='data/coco/annotations/coco_wholebody_val_v1.0.json')
+test_evaluator = val_evaluator
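
> Editorial note: a quick sanity-check sketch, assuming `mmengine` can resolve the `_base_` reference from the repo layout. The training set above combines COCO-WholeBody with the 15 UBody scenes, so the `CombinedDataset` should wrap 16 sub-datasets.

```python
# Verify the composition of the CombinedDataset built by the config above.
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/configs/wholebody_2d_keypoint/topdown_heatmap/ubody2d/'
    'td-hm_hrnet-w32_8xb64-210e_ubody-256x192.py')

datasets = cfg.train_dataloader.dataset.datasets
print(len(datasets))         # 16 = 1 x COCO-WholeBody + 15 UBody scenes
print(datasets[1].ann_file)  # annotations/Magic_show/train_annotations.json
# Each UBody scene uses sample_interval=10, i.e. roughly every tenth
# annotation of the video-derived data is kept for training.
```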
diff --git a/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py
new file mode 100644
index 0000000..4e4b8e3
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py
@@ -0,0 +1,270 @@
+# runtime settings
+default_scope = 'mmdet'
+
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=1),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='DetVisualizationHook'))
+
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+
+# model settings
+model = dict(
+ type='CascadeRCNN',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_mask=True,
+ pad_size_divisor=32),
+ backbone=dict(
+ type='ResNeXt',
+ depth=101,
+ groups=64,
+ base_width=4,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ style='pytorch',
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+ roi_head=dict(
+ type='CascadeRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=[
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ]),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=2000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PackDetInputs')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # Remove this step if there are no ground-truth annotations
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+train_dataloader = dict(
+ batch_size=2,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ batch_sampler=dict(type='AspectRatioBatchSampler'),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ metric='bbox',
+ format_only=False)
+test_evaluator = val_evaluator
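
> Editorial note: a hedged usage sketch for the single-class (person) detector config above, via the standard MMDetection 3.x API; the checkpoint filename is a placeholder for a fine-tuned weight file.

```python
# Detect persons with the cascade R-CNN config above (MMDetection 3.x API).
from mmdet.apis import inference_detector, init_detector

detector = init_detector(
    'modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py',
    'cascade_rcnn_person.pth',  # hypothetical fine-tuned checkpoint
    device='cpu')

result = inference_detector(detector, 'demo.jpg')
# Filter boxes with the same score threshold used in test_cfg (0.05).
keep = result.pred_instances.scores > 0.05
person_bboxes = result.pred_instances.bboxes[keep]
print(person_bboxes.shape)  # (num_persons, 4), xyxy format
```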
diff --git a/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py
new file mode 100644
index 0000000..5b9d43a
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_coco.py
@@ -0,0 +1,256 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[16, 19])
+total_epochs = 20
+
+# model settings
+model = dict(
+ type='CascadeRCNN',
+ pretrained='open-mmlab://resnext101_64x4d',
+ backbone=dict(
+ type='ResNeXt',
+ depth=101,
+ groups=64,
+ base_width=4,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+ roi_head=dict(
+ type='CascadeRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=[
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ]),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=2000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
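
> Editorial note: unlike the 3.x-style `cascade_rcnn_x101_64x4d_fpn_1class.py` above, this file is written in legacy MMDetection 2.x syntax. A rough, illustrative key correspondence (not an automatic converter):

```python
# Legacy (2.x) config keys and their approximate MMDetection 3.x equivalents.
legacy_to_3x = {
    'data.samples_per_gpu':   'train_dataloader.batch_size',
    'data.workers_per_gpu':   'train_dataloader.num_workers',
    'data.train.img_prefix':  'train_dataloader.dataset.data_prefix.img',
    'optimizer':              'optim_wrapper.optimizer',
    'lr_config':              'param_scheduler',
    'runner.max_epochs':      'train_cfg.max_epochs',
    'evaluation':             'val_evaluator / val_cfg',
}
for old, new in legacy_to_3x.items():
    print(f'{old:24s} -> {new}')
```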
diff --git a/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py
new file mode 100644
index 0000000..e1fc35d
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_1class.py
@@ -0,0 +1,182 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[8, 11])
+total_epochs = 12
+
+model = dict(
+ type='FasterRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)
+ # soft-nms is also supported for rcnn testing
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+ ))
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py
new file mode 100644
index 0000000..74d5498
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py
@@ -0,0 +1,196 @@
+# runtime settings
+default_scope = 'mmdet'
+
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=1),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='DetVisualizationHook'))
+
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+
+# model settings
+model = dict(
+ type='FasterRCNN',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_size_divisor=32),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)
+ # soft-nms is also supported for rcnn testing
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+ ))
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PackDetInputs')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # Remove this step if there are no ground-truth annotations
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+train_dataloader = dict(
+ batch_size=2,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ batch_sampler=dict(type='AspectRatioBatchSampler'),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ metric='bbox',
+ format_only=False)
+test_evaluator = val_evaluator
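
> Editorial note: the `rcnn` test_cfg above suppresses overlapping detections with plain NMS at `iou_threshold=0.5`. A self-contained numpy sketch of that step, for intuition only:

```python
# Greedy non-maximum suppression over xyxy boxes; returns kept indices.
import numpy as np

def nms(boxes: np.ndarray, scores: np.ndarray, iou_thr: float = 0.5):
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # Intersection of the top box with all remaining boxes.
        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = ((boxes[order[1:], 2] - boxes[order[1:], 0]) *
                 (boxes[order[1:], 3] - boxes[order[1:], 1]))
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= iou_thr]  # drop boxes overlapping too much
    return keep

boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [20, 20, 30, 30]], float)
scores = np.array([0.9, 0.8, 0.7])
print(nms(boxes, scores))  # [0, 2]: the near-duplicate box 1 is suppressed
```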
diff --git a/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py b/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py
new file mode 100644
index 0000000..226ae25
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/mask_rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,242 @@
+model = dict(
+ type='MaskRCNN',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type='CocoDataset',
+ ann_file='data/coco/annotations/instances_train2017.json',
+ img_prefix='data/coco/train2017/',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(
+ type='Collect',
+ keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+ ]),
+ val=dict(
+ type='CocoDataset',
+ ann_file='data/coco/annotations/instances_val2017.json',
+ img_prefix='data/coco/val2017/',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='CocoDataset',
+ ann_file='data/coco/annotations/instances_val2017.json',
+ img_prefix='data/coco/val2017/',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]))
+evaluation = dict(metric=['bbox', 'segm'])
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[16, 22])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+checkpoint_config = dict(interval=1)
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+custom_hooks = [dict(type='NumClassCheckHook')]
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py b/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py
new file mode 100644
index 0000000..87bc3c8
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py
@@ -0,0 +1,20 @@
+_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py'
+
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa
+
+model = dict(
+ backbone=dict(
+ init_cfg=dict(
+ type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+ bbox_head=dict(num_classes=1),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.6),
+ max_per_img=100))
+
+train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
+
+val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
+test_dataloader = val_dataloader
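
> Editorial note: a sketch of how the `mmdet::` `_base_` reference resolves, assuming MMDetection is installed so mmengine can locate the base config; the overrides above are merged on load, yielding a person-only detector.

```python
# Inspect the merged config: overrides win over the inherited base values.
from mmengine.config import Config

cfg = Config.fromfile(
    'modules/rtmpose/mmdetection_cfg/rtmdet_m_640-8xb32_coco-person.py')
print(cfg.model.bbox_head.num_classes)        # 1 (overridden from 80)
print(cfg.train_dataloader.dataset.metainfo)  # {'classes': ('person',)}
print(cfg.model.test_cfg.nms.iou_threshold)   # 0.6
```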
diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py b/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py
new file mode 100644
index 0000000..7cca574
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/rtmdet_m_8xb32-300e_coco.py
@@ -0,0 +1 @@
+_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py'
diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py
new file mode 100644
index 0000000..a681da0
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_coco-person.py
@@ -0,0 +1,104 @@
+_base_ = 'mmdet::rtmdet/rtmdet_l_8xb32-300e_coco.py'
+
+input_shape = 320
+
+model = dict(
+ backbone=dict(
+ deepen_factor=0.33,
+ widen_factor=0.25,
+ use_depthwise=True,
+ ),
+ neck=dict(
+ in_channels=[64, 128, 256],
+ out_channels=64,
+ num_csp_blocks=1,
+ use_depthwise=True,
+ ),
+ bbox_head=dict(
+ in_channels=64,
+ feat_channels=64,
+ share_conv=False,
+ exp_on_reg=False,
+ use_depthwise=True,
+ num_classes=1),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.6),
+ max_per_img=100))
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='CachedMosaic',
+ img_scale=(input_shape, input_shape),
+ pad_val=114.0,
+ max_cached_images=20,
+ random_pop=False),
+ dict(
+ type='RandomResize',
+ scale=(input_shape * 2, input_shape * 2),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='RandomResize',
+ scale=(input_shape, input_shape),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+train_dataloader = dict(
+ dataset=dict(pipeline=train_pipeline, metainfo=dict(classes=('person', ))))
+
+val_dataloader = dict(
+ dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=('person', ))))
+test_dataloader = val_dataloader
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='PipelineSwitchHook',
+ switch_epoch=280,
+ switch_pipeline=train_pipeline_stage2)
+]
diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py
new file mode 100644
index 0000000..abec3c0
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/rtmdet_nano_320-8xb32_hand.py
@@ -0,0 +1,171 @@
+_base_ = 'mmdet::rtmdet/rtmdet_l_8xb32-300e_coco.py'
+
+input_shape = 320
+
+model = dict(
+ backbone=dict(
+ deepen_factor=0.33,
+ widen_factor=0.25,
+ use_depthwise=True,
+ ),
+ neck=dict(
+ in_channels=[64, 128, 256],
+ out_channels=64,
+ num_csp_blocks=1,
+ use_depthwise=True,
+ ),
+ bbox_head=dict(
+ in_channels=64,
+ feat_channels=64,
+ share_conv=False,
+ exp_on_reg=False,
+ use_depthwise=True,
+ num_classes=1),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.6),
+ max_per_img=100))
+
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({'data/': 's3://openmmlab/datasets/'}))
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='CachedMosaic',
+ img_scale=(input_shape, input_shape),
+ pad_val=114.0,
+ max_cached_images=20,
+ random_pop=False),
+ dict(
+ type='RandomResize',
+ scale=(input_shape * 2, input_shape * 2),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='RandomResize',
+ scale=(input_shape, input_shape),
+ ratio_range=(0.5, 1.5),
+ keep_ratio=True),
+ dict(type='RandomCrop', crop_size=(input_shape, input_shape)),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True),
+ dict(
+ type='Pad',
+ size=(input_shape, input_shape),
+ pad_val=dict(img=(114, 114, 114))),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+data_mode = 'topdown'
+data_root = 'data/'
+
+train_dataset = dict(
+ _delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ dict(
+ type='mmpose.OneHand10KDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ pipeline=train_pipeline,
+ ann_file='onehand10k/annotations/onehand10k_train.json',
+ data_prefix=dict(img='pose/OneHand10K/')),
+ dict(
+ type='mmpose.FreiHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ pipeline=train_pipeline,
+ ann_file='freihand/annotations/freihand_train.json',
+ data_prefix=dict(img='pose/FreiHand/')),
+ dict(
+ type='mmpose.Rhd2DDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ pipeline=train_pipeline,
+ ann_file='rhd/annotations/rhd_train.json',
+ data_prefix=dict(img='pose/RHD/')),
+ dict(
+ type='mmpose.HalpeHandDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ pipeline=train_pipeline,
+ ann_file='halpe/annotations/halpe_train_v1.json',
+ data_prefix=dict(
+ img='pose/Halpe/hico_20160224_det/images/train2015/') # noqa
+ )
+ ],
+ ignore_keys=[
+ 'CLASSES', 'dataset_keypoint_weights', 'dataset_name', 'flip_indices',
+ 'flip_pairs', 'keypoint_colors', 'keypoint_id2name',
+ 'keypoint_name2id', 'lower_body_ids', 'num_keypoints',
+ 'num_skeleton_links', 'sigmas', 'skeleton_link_colors',
+ 'skeleton_links', 'upper_body_ids'
+ ],
+)
+
+test_dataset = dict(
+ _delete_=True,
+ type='mmpose.OneHand10KDataset',
+ data_root=data_root,
+ data_mode=data_mode,
+ pipeline=test_pipeline,
+ ann_file='onehand10k/annotations/onehand10k_test.json',
+ data_prefix=dict(img='pose/OneHand10K/'),
+)
+
+train_dataloader = dict(dataset=train_dataset)
+val_dataloader = dict(dataset=test_dataset)
+test_dataloader = val_dataloader
+
+custom_hooks = [
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0002,
+ update_buffers=True,
+ priority=49),
+ dict(
+ type='PipelineSwitchHook',
+ switch_epoch=280,
+ switch_pipeline=train_pipeline_stage2)
+]
+
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file=data_root + 'onehand10k/annotations/onehand10k_test.json',
+ metric='bbox',
+ format_only=False)
+test_evaluator = val_evaluator
+
+train_cfg = dict(val_interval=1)
diff --git a/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py b/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py
new file mode 100644
index 0000000..d6a5a64
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/rtmdet_tiny_8xb32-300e_coco.py
@@ -0,0 +1 @@
+_base_ = 'mmdet::rtmdet/rtmdet_tiny_8xb32-300e_coco.py'
diff --git a/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py
new file mode 100644
index 0000000..ee463f0
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py
@@ -0,0 +1,136 @@
+# model settings
+data_preprocessor = dict(
+ type='DetDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_size_divisor=1)
+model = dict(
+ type='SingleStageDetector',
+ data_preprocessor=data_preprocessor,
+ backbone=dict(
+ type='MobileNetV2',
+ out_indices=(4, 7),
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ neck=dict(
+ type='SSDNeck',
+ in_channels=(96, 1280),
+ out_channels=(96, 1280, 512, 256, 256, 128),
+ level_strides=(2, 2, 2, 2),
+ level_paddings=(1, 1, 1, 1),
+ l2_norm_scale=None,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ bbox_head=dict(
+ type='SSDHead',
+ in_channels=(96, 1280, 512, 256, 256, 128),
+ num_classes=80,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+
+ # set anchor size manually instead of using the predefined
+ # SSD300 setting.
+ anchor_generator=dict(
+ type='SSDAnchorGenerator',
+ scale_major=False,
+ strides=[16, 32, 64, 107, 160, 320],
+ ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+ min_sizes=[48, 100, 150, 202, 253, 304],
+ max_sizes=[100, 150, 202, 253, 304, 320]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[0.1, 0.1, 0.2, 0.2])),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.,
+ ignore_iof_thr=-1,
+ gt_max_assign_all=False),
+ sampler=dict(type='PseudoSampler'),
+ smoothl1_beta=1.,
+ allowed_border=-1,
+ pos_weight=-1,
+ neg_pos_ratio=3,
+ debug=False),
+ test_cfg=dict(
+ nms_pre=1000,
+ nms=dict(type='nms', iou_threshold=0.45),
+ min_bbox_size=0,
+ score_thr=0.02,
+ max_per_img=200))
+env_cfg = dict(cudnn_benchmark=True)
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+input_size = 320
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='Expand',
+ mean=data_preprocessor['mean'],
+ to_rgb=data_preprocessor['bgr_to_rgb'],
+ ratio_range=(1, 4)),
+ dict(
+ type='MinIoURandomCrop',
+ min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+ min_crop_size=0.3),
+ dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+ dict(type='RandomFlip', prob=0.5),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=32,
+ contrast_range=(0.5, 1.5),
+ saturation_range=(0.5, 1.5),
+ hue_delta=18),
+ dict(type='PackDetInputs')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+train_dataloader = dict(
+ batch_size=24,
+ num_workers=4,
+ batch_sampler=None,
+ dataset=dict(
+ _delete_=True,
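+        # NOTE: _delete_ only takes effect when inheriting from a _base_
+        # config; presumably kept for parity with the upstream mmdet file.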
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py
new file mode 100644
index 0000000..1448916
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/ssdlite_mobilenetv2_scratch_600e_onehand.py
@@ -0,0 +1,153 @@
+# =========================================================
+# from 'mmdetection/configs/_base_/default_runtime.py'
+# =========================================================
+default_scope = 'mmdet'
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+# =========================================================
+
+# model settings
+data_preprocessor = dict(
+ type='DetDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ bgr_to_rgb=True,
+ pad_size_divisor=1)
+model = dict(
+ type='SingleStageDetector',
+ data_preprocessor=data_preprocessor,
+ backbone=dict(
+ type='MobileNetV2',
+ out_indices=(4, 7),
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ neck=dict(
+ type='SSDNeck',
+ in_channels=(96, 1280),
+ out_channels=(96, 1280, 512, 256, 256, 128),
+ level_strides=(2, 2, 2, 2),
+ level_paddings=(1, 1, 1, 1),
+ l2_norm_scale=None,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+ bbox_head=dict(
+ type='SSDHead',
+ in_channels=(96, 1280, 512, 256, 256, 128),
+ num_classes=1,
+ use_depthwise=True,
+ norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+ act_cfg=dict(type='ReLU6'),
+ init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+
+ # set anchor size manually instead of using the predefined
+ # SSD300 setting.
+ anchor_generator=dict(
+ type='SSDAnchorGenerator',
+ scale_major=False,
+ strides=[16, 32, 64, 107, 160, 320],
+ ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+ min_sizes=[48, 100, 150, 202, 253, 304],
+ max_sizes=[100, 150, 202, 253, 304, 320]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[0.1, 0.1, 0.2, 0.2])),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.,
+ ignore_iof_thr=-1,
+ gt_max_assign_all=False),
+ sampler=dict(type='PseudoSampler'),
+ smoothl1_beta=1.,
+ allowed_border=-1,
+ pos_weight=-1,
+ neg_pos_ratio=3,
+ debug=False),
+ test_cfg=dict(
+ nms_pre=1000,
+ nms=dict(type='nms', iou_threshold=0.45),
+ min_bbox_size=0,
+ score_thr=0.02,
+ max_per_img=200))
+cudnn_benchmark = True
+
+# dataset settings
+file_client_args = dict(backend='disk')
+
+dataset_type = 'CocoDataset'
+data_root = 'data/onehand10k/'
+classes = ('hand', )
+input_size = 320
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file='annotations/onehand10k_test.json',
+ test_mode=True,
+ pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5)
+optimizer_config = dict(grad_clip=None)
+
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ min_lr=0)
+runner = dict(type='EpochBasedRunner', max_epochs=120)
+
+# Avoid evaluation and saving weights too frequently
+evaluation = dict(interval=5, metric='bbox')
+checkpoint_config = dict(interval=5)
+custom_hooks = [
+ dict(type='NumClassCheckHook'),
+ dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+log_config = dict(interval=5)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (24 samples per GPU)
+auto_scale_lr = dict(base_batch_size=192)
+
+load_from = 'https://download.openmmlab.com/mmdetection/' \
+    'v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/' \
+    'ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth'
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py b/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py
new file mode 100644
index 0000000..2d3efab
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/yolov3_d53_320_273e_coco.py
@@ -0,0 +1,143 @@
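+# NOTE: this file keeps the legacy MMDetection 2.x schema (img_norm_cfg,
+# data=dict(...), runner, lr_config); it will not run under the mmdet 3.x
+# install from the README without migration to the 3.x config format.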
+# model settings
+model = dict(
+ type='YOLOV3',
+ pretrained='open-mmlab://darknet53',
+ backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
+ neck=dict(
+ type='YOLOV3Neck',
+ num_scales=3,
+ in_channels=[1024, 512, 256],
+ out_channels=[512, 256, 128]),
+ bbox_head=dict(
+ type='YOLOV3Head',
+ num_classes=80,
+ in_channels=[512, 256, 128],
+ out_channels=[1024, 512, 256],
+ anchor_generator=dict(
+ type='YOLOAnchorGenerator',
+ base_sizes=[[(116, 90), (156, 198), (373, 326)],
+ [(30, 61), (62, 45), (59, 119)],
+ [(10, 13), (16, 30), (33, 23)]],
+ strides=[32, 16, 8]),
+ bbox_coder=dict(type='YOLOBBoxCoder'),
+ featmap_strides=[32, 16, 8],
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ loss_weight=1.0,
+ reduction='sum'),
+ loss_conf=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ loss_weight=1.0,
+ reduction='sum'),
+ loss_xy=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ loss_weight=2.0,
+ reduction='sum'),
+ loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')),
+ # training and testing settings
+ train_cfg=dict(
+ assigner=dict(
+ type='GridAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0)),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ conf_thr=0.005,
+ nms=dict(type='nms', iou_threshold=0.45),
+ max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile', to_float32=True),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(type='PhotoMetricDistortion'),
+ dict(
+ type='Expand',
+ mean=img_norm_cfg['mean'],
+ to_rgb=img_norm_cfg['to_rgb'],
+ ratio_range=(1, 2)),
+ dict(
+ type='MinIoURandomCrop',
+ min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+ min_crop_size=0.3),
+ dict(type='Resize', img_scale=(320, 320), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(320, 320),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=f'{data_root}/annotations/instances_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000, # same as burn-in in darknet
+ warmup_ratio=0.1,
+ step=[218, 246])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=273)
+evaluation = dict(interval=1, metric=['bbox'])
+
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py b/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py
new file mode 100644
index 0000000..225d2c6
--- /dev/null
+++ b/modules/rtmpose/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py
@@ -0,0 +1,302 @@
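+# YOLOX-s face detector: a single face class (labelled 'person' in metainfo),
+# initialised from the COCO-pretrained YOLOX-s checkpoint in load_from.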
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300, val_interval=10)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+param_scheduler = [
+ dict(
+ type='mmdet.QuadraticWarmupLR',
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0.0005,
+ begin=5,
+ T_max=285,
+ end=285,
+ by_epoch=True,
+ convert_to_iter_based=True),
+ dict(type='ConstantLR', by_epoch=True, factor=1, begin=285, end=300)
+]
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(
+ type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0))
+auto_scale_lr = dict(enable=False, base_batch_size=64)
+default_scope = 'mmdet'
+default_hooks = dict(
+ timer=dict(type='IterTimerHook'),
+ logger=dict(type='LoggerHook', interval=50),
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ visualization=dict(type='DetVisualizationHook'))
+env_cfg = dict(
+ cudnn_benchmark=False,
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+ dist_cfg=dict(backend='nccl'))
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='DetLocalVisualizer',
+ vis_backends=[dict(type='LocalVisBackend')],
+ name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+log_level = 'INFO'
+load_from = 'https://download.openmmlab.com/mmdetection/' \
+ 'v2.0/yolox/yolox_s_8x8_300e_coco/' \
+ 'yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth'
+resume = False
+img_scale = (640, 640)
+model = dict(
+ type='YOLOX',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ pad_size_divisor=32,
+ batch_augments=[
+ dict(
+ type='BatchSyncRandomResize',
+ random_size_range=(480, 800),
+ size_divisor=32,
+ interval=10)
+ ]),
+ backbone=dict(
+ type='CSPDarknet',
+ deepen_factor=0.33,
+ widen_factor=0.5,
+ out_indices=(2, 3, 4),
+ use_depthwise=False,
+ spp_kernal_sizes=(5, 9, 13),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ neck=dict(
+ type='YOLOXPAFPN',
+ in_channels=[128, 256, 512],
+ out_channels=128,
+ num_csp_blocks=1,
+ use_depthwise=False,
+ upsample_cfg=dict(scale_factor=2, mode='nearest'),
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish')),
+ bbox_head=dict(
+ type='YOLOXHead',
+ num_classes=1,
+ in_channels=128,
+ feat_channels=128,
+ stacked_convs=2,
+ strides=(8, 16, 32),
+ use_depthwise=False,
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+ act_cfg=dict(type='Swish'),
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0),
+ loss_bbox=dict(
+ type='IoULoss',
+ mode='square',
+ eps=1e-16,
+ reduction='sum',
+ loss_weight=5.0),
+ loss_obj=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0),
+ loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
+ train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+backend_args = dict(backend='local')
+train_pipeline = [
+ dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0),
+ dict(
+ type='RandomAffine', scaling_ratio_range=(0.1, 2),
+ border=(-320, -320)),
+ dict(
+ type='MixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+ dict(type='PackDetInputs')
+]
+train_dataset = dict(
+ type='MultiImageMixDataset',
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/instances_train2017.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
+ dict(type='LoadAnnotations', with_bbox=True)
+ ],
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ pipeline=[
+ dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0),
+ dict(
+ type='RandomAffine',
+ scaling_ratio_range=(0.1, 2),
+ border=(-320, -320)),
+ dict(
+ type='MixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(
+ type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+ dict(type='PackDetInputs')
+ ])
+test_pipeline = [
+ dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=4,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='MultiImageMixDataset',
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/coco_face_train.json',
+ data_prefix=dict(img='train2017/'),
+ pipeline=[
+ dict(
+ type='LoadImageFromFile',
+ backend_args=dict(backend='local')),
+ dict(type='LoadAnnotations', with_bbox=True)
+ ],
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60))),
+ pipeline=[
+ dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0),
+ dict(
+ type='RandomAffine',
+ scaling_ratio_range=(0.1, 2),
+ border=(-320, -320)),
+ dict(
+ type='MixUp',
+ img_scale=(640, 640),
+ ratio_range=(0.8, 1.6),
+ pad_val=114.0),
+ dict(type='YOLOXHSVRandomAug'),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(
+ type='FilterAnnotations',
+ min_gt_bbox_wh=(1, 1),
+ keep_empty=False),
+ dict(type='PackDetInputs')
+ ]))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/coco_face_val.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+ ],
+ metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60))))
+test_dataloader = dict(
+ batch_size=8,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/coco_face_val.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=[
+ dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
+ dict(
+ type='Pad',
+ pad_to_square=True,
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+ ],
+ metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60))))
+val_evaluator = dict(
+ type='CocoMetric',
+ ann_file='data/coco/annotations/coco_face_val.json',
+ metric='bbox')
+test_evaluator = dict(
+    type='CocoMetric',
+    ann_file='data/coco/annotations/coco_face_val.json',
+    metric='bbox')
+max_epochs = 300
+num_last_epochs = 15
+interval = 10
+base_lr = 0.01
+custom_hooks = [
+ dict(type='YOLOXModeSwitchHook', num_last_epochs=15, priority=48),
+ dict(type='SyncNormHook', priority=48),
+ dict(
+ type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ strict_load=False,
+ update_buffers=True,
+ priority=49)
+]
+metainfo = dict(CLASSES=('person', ), PALETTE=(220, 20, 60))
+launcher = 'pytorch'
diff --git a/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py b/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py
new file mode 100644
index 0000000..3dd5129
--- /dev/null
+++ b/modules/rtmpose/mmtracking_cfg/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py
@@ -0,0 +1,323 @@
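+# NOTE: MMTracking-style config (legacy 2.x schema); running it requires the
+# mmtrack package, which the README install steps do not cover.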
+model = dict(
+ detector=dict(
+ type='FasterRCNN',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch',
+ init_cfg=dict(
+ type='Pretrained', checkpoint='torchvision://resnet50')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0],
+ clip_border=False),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(
+ type='SmoothL1Loss', beta=0.1111111111111111,
+ loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(
+ type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.1, 0.1, 0.2, 0.2],
+ clip_border=False),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmtracking/'
+ 'mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth')),
+ type='DeepSORT',
+ motion=dict(type='KalmanFilter', center_only=False),
+ reid=dict(
+ type='BaseReID',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3, ),
+ style='pytorch'),
+ neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+ head=dict(
+ type='LinearReIDHead',
+ num_fcs=1,
+ in_channels=2048,
+ fc_channels=1024,
+ out_channels=128,
+ num_classes=380,
+ loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+ loss_pairwise=dict(
+ type='TripletLoss', margin=0.3, loss_weight=1.0),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU')),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='https://download.openmmlab.com/mmtracking/'
+ 'mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth')),
+ tracker=dict(
+ type='SortTracker',
+ obj_score_thr=0.5,
+ reid=dict(
+ num_samples=10,
+ img_scale=(256, 128),
+ img_norm_cfg=None,
+ match_score_thr=2.0),
+ match_iou_thr=0.5,
+ momentums=None,
+ num_tentatives=2,
+ num_frames_retain=100))
+dataset_type = 'MOTChallengeDataset'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadMultiImagesFromFile', to_float32=True),
+ dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
+ dict(
+ type='SeqResize',
+ img_scale=(1088, 1088),
+ share_params=True,
+ ratio_range=(0.8, 1.2),
+ keep_ratio=True,
+ bbox_clip_border=False),
+ dict(type='SeqPhotoMetricDistortion', share_params=True),
+ dict(
+ type='SeqRandomCrop',
+ share_params=False,
+ crop_size=(1088, 1088),
+ bbox_clip_border=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=32),
+ dict(type='MatchInstances', skip_nomatch=True),
+ dict(
+ type='VideoCollect',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
+ 'gt_instance_ids'
+ ]),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+]
+data_root = 'data/MOT17/'
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type='MOTChallengeDataset',
+ visibility_thr=-1,
+ ann_file='data/MOT17/annotations/half-train_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=dict(
+ num_ref_imgs=1,
+ frame_range=10,
+ filter_key_img=True,
+ method='uniform'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile', to_float32=True),
+ dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
+ dict(
+ type='SeqResize',
+ img_scale=(1088, 1088),
+ share_params=True,
+ ratio_range=(0.8, 1.2),
+ keep_ratio=True,
+ bbox_clip_border=False),
+ dict(type='SeqPhotoMetricDistortion', share_params=True),
+ dict(
+ type='SeqRandomCrop',
+ share_params=False,
+ crop_size=(1088, 1088),
+ bbox_clip_border=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=32),
+ dict(type='MatchInstances', skip_nomatch=True),
+ dict(
+ type='VideoCollect',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
+ 'gt_instance_ids'
+ ]),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+ ]),
+ val=dict(
+ type='MOTChallengeDataset',
+ ann_file='data/MOT17/annotations/half-val_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=None,
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='MOTChallengeDataset',
+ ann_file='data/MOT17/annotations/half-val_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=None,
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+ ]))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+checkpoint_config = dict(interval=1)
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=100,
+ warmup_ratio=0.01,
+ step=[3])
+total_epochs = 4
+evaluation = dict(metric=['bbox', 'track'], interval=1)
+search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']
diff --git a/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py b/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py
new file mode 100644
index 0000000..db94427
--- /dev/null
+++ b/modules/rtmpose/mmtracking_cfg/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py
@@ -0,0 +1,327 @@
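+# NOTE: MMTracking-style config; see the note in
+# deepsort_faster-rcnn_fpn_4e_mot17-private-half.py.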
+model = dict(
+ detector=dict(
+ type='FasterRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0],
+ clip_border=False),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(
+ type='SmoothL1Loss', beta=0.1111111111111111,
+ loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(
+ type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.1, 0.1, 0.2, 0.2],
+ clip_border=False),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100))),
+ type='Tracktor',
+ pretrains=dict(
+ detector='https://download.openmmlab.com/mmtracking/'
+ 'mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth',
+ reid='https://download.openmmlab.com/mmtracking/mot/'
+ 'reid/reid_r50_6e_mot17-4bf6b63d.pth'),
+ reid=dict(
+ type='BaseReID',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3, ),
+ style='pytorch'),
+ neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+ head=dict(
+ type='LinearReIDHead',
+ num_fcs=1,
+ in_channels=2048,
+ fc_channels=1024,
+ out_channels=128,
+ num_classes=378,
+ loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+ loss_pairwise=dict(
+ type='TripletLoss', margin=0.3, loss_weight=1.0),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU'))),
+ motion=dict(
+ type='CameraMotionCompensation',
+ warp_mode='cv2.MOTION_EUCLIDEAN',
+ num_iters=100,
+ stop_eps=1e-05),
+ tracker=dict(
+ type='TracktorTracker',
+ obj_score_thr=0.5,
+ regression=dict(
+ obj_score_thr=0.5,
+ nms=dict(type='nms', iou_threshold=0.6),
+ match_iou_thr=0.3),
+ reid=dict(
+ num_samples=10,
+ img_scale=(256, 128),
+ img_norm_cfg=None,
+ match_score_thr=2.0,
+ match_iou_thr=0.2),
+ momentums=None,
+ num_frames_retain=10))
+dataset_type = 'MOTChallengeDataset'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadMultiImagesFromFile', to_float32=True),
+ dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
+ dict(
+ type='SeqResize',
+ img_scale=(1088, 1088),
+ share_params=True,
+ ratio_range=(0.8, 1.2),
+ keep_ratio=True,
+ bbox_clip_border=False),
+ dict(type='SeqPhotoMetricDistortion', share_params=True),
+ dict(
+ type='SeqRandomCrop',
+ share_params=False,
+ crop_size=(1088, 1088),
+ bbox_clip_border=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=32),
+ dict(type='MatchInstances', skip_nomatch=True),
+ dict(
+ type='VideoCollect',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
+ 'gt_instance_ids'
+ ]),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+]
+data_root = 'data/MOT17/'
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type='MOTChallengeDataset',
+ visibility_thr=-1,
+ ann_file='data/MOT17/annotations/train_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=dict(
+ num_ref_imgs=1,
+ frame_range=10,
+ filter_key_img=True,
+ method='uniform'),
+ pipeline=[
+ dict(type='LoadMultiImagesFromFile', to_float32=True),
+ dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
+ dict(
+ type='SeqResize',
+ img_scale=(1088, 1088),
+ share_params=True,
+ ratio_range=(0.8, 1.2),
+ keep_ratio=True,
+ bbox_clip_border=False),
+ dict(type='SeqPhotoMetricDistortion', share_params=True),
+ dict(
+ type='SeqRandomCrop',
+ share_params=False,
+ crop_size=(1088, 1088),
+ bbox_clip_border=False),
+ dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
+ dict(
+ type='SeqNormalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='SeqPad', size_divisor=32),
+ dict(type='MatchInstances', skip_nomatch=True),
+ dict(
+ type='VideoCollect',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',
+ 'gt_instance_ids'
+ ]),
+ dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
+ ]),
+ val=dict(
+ type='MOTChallengeDataset',
+ ann_file='data/MOT17/annotations/train_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=None,
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='MOTChallengeDataset',
+ ann_file='data/MOT17/annotations/train_cocoformat.json',
+ img_prefix='data/MOT17/train',
+ ref_img_sampler=None,
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1088, 1088),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(
+ type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='VideoCollect', keys=['img'])
+ ])
+ ]))
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+checkpoint_config = dict(interval=1)
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=100,
+ warmup_ratio=0.01,
+ step=[3])
+total_epochs = 4
+evaluation = dict(metric=['bbox', 'track'], interval=1)
+search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']
+test_set = 'train'
diff --git a/requirements.txt b/requirements.txt
index 1d9772d..6437e1d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
python-dotenv == 1.0.1
opencv-python == 4.10.0.84
-torch == 2.4.0
-torchvision == 0.19.0
requests == 2.32.3
pandas == 2.2.2
joblib == 1.4.2
-lightgbm == 4.5.0
\ No newline at end of file
+lightgbm == 4.5.0
+xgboost == 2.1.1
+scipy == 1.9.3
+numpy == 1.24.0
\ No newline at end of file