Merged
42 commits
77e8736
Refactor pose visualization in vis_in_the_wild.py
xiu-cs Feb 5, 2026
5001c3c
Update vis_in_the_wild.sh for model path and GPU configuration
xiu-cs Feb 5, 2026
5091074
Refactor argument parsing in arguments.py
xiu-cs Feb 5, 2026
fb14cf5
Update FMPose3D_test.sh to reflect new model paths
xiu-cs Feb 5, 2026
500e097
Update vis_in_the_wild.sh to standardize model weights path
xiu-cs Feb 5, 2026
82244ce
Update FMPose3D_train.sh for model path consistency
xiu-cs Feb 5, 2026
dbd2afa
Refactor aggregation method in aggregation_methods.py
xiu-cs Feb 5, 2026
d6dcf06
Add model_path argument in arguments.py and remove saved_model_path
xiu-cs Feb 5, 2026
db005f3
Refactor file backup process in FMPose3D_main.py
xiu-cs Feb 5, 2026
0a8487a
Remove weight_softmax_tau variable from FMPose3D_test.sh for consiste…
xiu-cs Feb 5, 2026
6608049
fix the path error
xiu-cs Feb 5, 2026
af6ec60
Update model path variable in vis_in_the_wild.py to align with recent…
xiu-cs Feb 8, 2026
b634a75
correct the color of joints
xiu-cs Feb 8, 2026
ea6e291
Add demo GIF for visual representation
xiu-cs Feb 8, 2026
c6c148c
Update demo image in README.md from JPG to GIF for enhanced visual re…
xiu-cs Feb 8, 2026
d76b239
update the model structure
xiu-cs Feb 9, 2026
1c3ca70
Revise README for clarity and updates
MMathisLab Feb 6, 2026
e02edd5
Update torch.load weigths_only=True
deruyter92 Feb 9, 2026
1dbea87
fix README broken link and typo
deruyter92 Feb 9, 2026
82149c9
Replace torch Variable with torch tensor
deruyter92 Feb 9, 2026
a184cb0
update cuda fallback
deruyter92 Feb 9, 2026
1739378
update gitignore
deruyter92 Feb 9, 2026
077eaa6
rename get_varialbe -> get_variable everywhere
deruyter92 Feb 9, 2026
81a22c6
Fix sys.path imports -> proper module references
deruyter92 Feb 9, 2026
f3d6ba8
Apply suggestion from @Copilot
xiu-cs Feb 9, 2026
67268c6
Apply suggestion from @Copilot
xiu-cs Feb 9, 2026
d74a76d
Apply suggestion from @Copilot
xiu-cs Feb 9, 2026
f8ab0a6
Add config dataclasses (in parallel to arguments.py)
deruyter92 Feb 9, 2026
db775ea
add tests for config.py
deruyter92 Feb 9, 2026
4611178
Apply suggestion from @Copilot
xiu-cs Feb 9, 2026
6b5d354
Apply suggestion from @Copilot
xiu-cs Feb 9, 2026
a4df14b
Merge pull request #12 from deruyter92/jaap/minor_refactors
xiu-cs Feb 9, 2026
b73d9ce
Feat: add extendable model registry
deruyter92 Feb 9, 2026
50125e5
change demo script add example for human pose model
deruyter92 Feb 9, 2026
5677b53
Merge branch 'ti_video_demo' into jaap/add_config_and_registry
xiu-cs Feb 9, 2026
4c8b201
update config: replace model_path with model_type from the registry
deruyter92 Feb 9, 2026
acb4feb
Merge branch 'jaap/add_config_and_registry' of github.com:deruyter92/…
deruyter92 Feb 9, 2026
3863ddf
Update config: extendable configs and changed name -> PipelineConfig
deruyter92 Feb 10, 2026
c4bf891
Add HRNet model api
deruyter92 Feb 10, 2026
0827a1b
Add high-level inference API for FMPose3D (fmpose3d/fmpose3d.py)
deruyter92 Feb 10, 2026
bad89d7
Merge branch 'main' into feat/add_api
xiu-cs Feb 10, 2026
1023b4e
Add documentation header for FMPose3D in HRNet files
xiu-cs Feb 10, 2026
5 changes: 5 additions & 0 deletions .gitignore
@@ -45,3 +45,8 @@ htmlcov/
*.pkl
*.h5
*.ckpt

# Excluded directories
pre_trained_models/
demo/predictions/
demo/images/
8 changes: 4 additions & 4 deletions README.md
@@ -2,11 +2,11 @@

![Version](https://img.shields.io/badge/python_version-3.10-purple)
[![PyPI version](https://badge.fury.io/py/fmpose3d.svg?icon=si%3Apython)](https://badge.fury.io/py/fmpose3d)
[![License: LApache 2.0](https://img.shields.io/badge/License-Apache2.0-blue.svg)](https://www.gnu.org/licenses/apach2.0)
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0)

This is the official implementation of the approach described in the preprint:

[**FMPose3D: monocular 3D pose estimation via flow matching**](http://arxiv.org/abs/2602.05755)
[**FMPose3D: monocular 3D pose estimation via flow matching**](https://arxiv.org/abs/2602.05755)
Ti Wang, Xiaohang Yu, Mackenzie Weygandt Mathis

<!-- <p align="center"><img src="./images/Frame 4.jpg" width="50%" alt="" /></p> -->
@@ -51,7 +51,7 @@ sh vis_in_the_wild.sh
```
The predictions will be saved to folder `demo/predictions`.

<p align="center"><img src="./images/demo.jpg" width="95%" alt="" /></p>
<p align="center"><img src="./images/demo.gif" width="95%" alt="" /></p>

## Training and Inference

@@ -79,7 +79,7 @@ The training logs, checkpoints, and related files of each training time will be

For training on Human3.6M:
```bash
sh /scripts/FMPose3D_train.sh
sh ./scripts/FMPose3D_train.sh
```

### Inference
14 changes: 7 additions & 7 deletions animals/demo/vis_animals.py
@@ -8,7 +8,6 @@
"""

# SuperAnimal Demo: https://github.com/DeepLabCut/DeepLabCut/blob/main/examples/COLAB/COLAB_YOURDATA_SuperAnimal.ipynb
import sys
import os
import numpy as np
import glob
@@ -25,8 +24,6 @@
from fmpose3d.animals.common.arguments import opts as parse_args
from fmpose3d.common.camera import normalize_screen_coordinates, camera_to_world

sys.path.append(os.getcwd())

args = parse_args().parse()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

@@ -334,13 +331,15 @@ def get_pose3D(path, output_dir, type='image'):
print(f"args.n_joints: {args.n_joints}, args.out_joints: {args.out_joints}")

## Reload model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = {}
model['CFM'] = CFM(args).cuda()
model['CFM'] = CFM(args).to(device)

model_dict = model['CFM'].state_dict()
model_path = args.saved_model_path
print(f"Loading model from: {model_path}")
pre_dict = torch.load(model_path)
pre_dict = torch.load(model_path, map_location=device, weights_only=True)
for name, key in model_dict.items():
model_dict[name] = pre_dict[name]
model['CFM'].load_state_dict(model_dict)
@@ -400,7 +399,8 @@ def get_3D_pose_from_image(args, keypoints, i, img, model, output_dir):
input_2D = np.expand_dims(input_2D, axis=0) # (1, J, 2)

# Convert to tensor format matching visualize_animal_poses.py
input_2D = torch.from_numpy(input_2D.astype('float32')).cuda() # (1, J, 2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_2D = torch.from_numpy(input_2D.astype('float32')).to(device) # (1, J, 2)
input_2D = input_2D.unsqueeze(0) # (1, 1, J, 2)

# Euler sampler for CFM
@@ -418,7 +418,7 @@ def euler_sample(c_2d, y_local, steps, model_3d):

# Single inference without flip augmentation
# Create 3D random noise with shape (1, 1, J, 3)
y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3).cuda()
y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3, device=device)
output_3D = euler_sample(input_2D, y, steps=args.sample_steps, model_3d=model)

output_3D = output_3D[0:, args.pad].unsqueeze(1)
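This vis_animals.py hunk (and the main_animal3d.py one below) repeat the same two-step loading pattern introduced across the PR: derive a device with a CPU fallback, then load checkpoints with `map_location` and `weights_only=True`. A minimal, self-contained sketch of that pattern; the stand-in `nn.Linear` model and the temp-file path are illustrative, not from the repo:

```python
import os
import tempfile

import torch
import torch.nn as nn

# Stand-in model; the real scripts build CFM(args) instead.
model = nn.Linear(17 * 2, 17 * 3)

# CPU fallback added throughout this PR: no hard dependency on CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Round-trip a state dict the way the updated scripts load weights:
# map_location lets CUDA-saved checkpoints load on CPU-only machines, and
# weights_only=True (recent PyTorch) restricts torch.load to tensors/primitives.
ckpt_path = os.path.join(tempfile.gettempdir(), "example_weights.pth")
torch.save(model.state_dict(), ckpt_path)
state = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(state)
```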
8 changes: 5 additions & 3 deletions animals/scripts/main_animal3d.py
@@ -75,7 +75,7 @@ def step(split, args, actions, dataLoader, model, optimizer=None, epoch=None, st
# gt_3D shape: torch.Size([B, J, 4]) (x,y,z + homogeneous coordinate)
gt_3D = gt_3D[:,:,:3] # only use x,y,z for 3D ground truth

# [input_2D, gt_3D, batch_cam, vis_3D] = get_varialbe(split, [input_2D, gt_3D, batch_cam, vis_3D])
# [input_2D, gt_3D, batch_cam, vis_3D] = get_variable(split, [input_2D, gt_3D, batch_cam, vis_3D])

# unsqueeze frame dimension
input_2D = input_2D.unsqueeze(1) # (B,F,J,C)
@@ -264,15 +264,17 @@ def get_parameter_number(net):
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size,
shuffle=False, num_workers=int(args.workers), pin_memory=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = {}
model['CFM'] = CFM(args).cuda()
model['CFM'] = CFM(args).to(device)

if args.reload:
model_dict = model['CFM'].state_dict()
# Prefer explicit saved_model_path; otherwise fallback to previous_dir glob
model_path = args.saved_model_path
print(model_path)
pre_dict = torch.load(model_path)
pre_dict = torch.load(model_path, weights_only=True, map_location=device)
for name, key in model_dict.items():
model_dict[name] = pre_dict[name]
model['CFM'].load_state_dict(model_dict)
73 changes: 40 additions & 33 deletions demo/vis_in_the_wild.py
@@ -7,7 +7,6 @@
Licensed under Apache 2.0
"""

import sys
import cv2
import os
import numpy as np
@@ -16,8 +15,6 @@
from tqdm import tqdm
import copy

sys.path.append(os.getcwd())

# Auto-download checkpoint files if missing
from fmpose3d.lib.checkpoint.download_checkpoints import ensure_checkpoints
ensure_checkpoints()
@@ -28,17 +25,10 @@

args = parse_args().parse()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
if getattr(args, 'model_path', ''):
import importlib.util
import pathlib
model_abspath = os.path.abspath(args.model_path)
module_name = pathlib.Path(model_abspath).stem
spec = importlib.util.spec_from_file_location(module_name, model_abspath)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
CFM = getattr(module, 'Model')


from fmpose3d.models import get_model
CFM = get_model(args.model_type)

from fmpose3d.common.camera import *

import matplotlib
@@ -50,15 +40,27 @@
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

def show2Dpose(kps, img):
connections = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5],
[5, 6], [0, 7], [7, 8], [8, 9], [9, 10],
[8, 11], [11, 12], [12, 13], [8, 14], [14, 15], [15, 16]]
# Shared skeleton definition so 2D/3D segment colors match
SKELETON_CONNECTIONS = [
[0, 1], [1, 2], [2, 3], [0, 4], [4, 5],
[5, 6], [0, 7], [7, 8], [8, 9], [9, 10],
[8, 11], [11, 12], [12, 13], [8, 14], [14, 15], [15, 16]
]
# LR mask for skeleton segments: True -> left color, False -> right color
SKELETON_LR = np.array(
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
dtype=bool,
)

LR = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=bool)
def show2Dpose(kps, img):
connections = SKELETON_CONNECTIONS
LR = SKELETON_LR

lcolor = (255, 0, 0)
rcolor = (0, 0, 255)
# lcolor = (240, 176, 0)
# rcolor = (240, 176, 0)

thickness = 3

for j,c in enumerate(connections):
@@ -67,8 +69,8 @@ def show2Dpose(kps, img):
start = list(start)
end = list(end)
cv2.line(img, (start[0], start[1]), (end[0], end[1]), lcolor if LR[j] else rcolor, thickness)
cv2.circle(img, (start[0], start[1]), thickness=-1, color=(0, 255, 0), radius=3)
cv2.circle(img, (end[0], end[1]), thickness=-1, color=(0, 255, 0), radius=3)
# cv2.circle(img, (start[0], start[1]), thickness=-1, color=(0, 255, 0), radius=3)
# cv2.circle(img, (end[0], end[1]), thickness=-1, color=(0, 255, 0), radius=3)

return img

@@ -77,11 +79,13 @@ def show3Dpose(vals, ax):

lcolor=(0,0,1)
rcolor=(1,0,0)

I = np.array( [0, 0, 1, 4, 2, 5, 0, 7, 8, 8, 14, 15, 11, 12, 8, 9])
J = np.array( [1, 4, 2, 5, 3, 6, 7, 8, 14, 11, 15, 16, 12, 13, 9, 10])

LR = np.array([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0], dtype=bool)
# lcolor=(0/255, 176/255, 240/255)
# rcolor=(0/255, 176/255, 240/255)


I = np.array([c[0] for c in SKELETON_CONNECTIONS])
J = np.array([c[1] for c in SKELETON_CONNECTIONS])
LR = SKELETON_LR

for i in np.arange( len(I) ):
x, y, z = [np.array( [vals[I[i], j], vals[J[i], j]] ) for j in range(3)]
@@ -199,7 +203,8 @@ def get_3D_pose_from_image(args, keypoints, i, img, model, output_dir):

input_2D = input_2D[np.newaxis, :, :, :, :]

input_2D = torch.from_numpy(input_2D.astype('float32')).cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_2D = torch.from_numpy(input_2D.astype('float32')).to(device)

N = input_2D.size(0)

@@ -215,10 +220,10 @@ def euler_sample(c_2d, y_local, steps, model_3d):

## estimation

y = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3).cuda()
y = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3, device=device)
output_3D_non_flip = euler_sample(input_2D[:, 0], y, steps=args.sample_steps, model_3d=model)

y_flip = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3).cuda()
y_flip = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3, device=device)
output_3D_flip = euler_sample(input_2D[:, 1], y_flip, steps=args.sample_steps, model_3d=model)

output_3D_flip[:, :, :, 0] *= -1
@@ -266,14 +271,16 @@ def get_pose3D(path, output_dir, type='image'):
# args.type = type

## Reload
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = {}
model['CFM'] = CFM(args).cuda()
model['CFM'] = CFM(args).to(device)

# if args.reload:
model_dict = model['CFM'].state_dict()
model_path = args.saved_model_path
model_path = args.model_weights_path
print(model_path)
pre_dict = torch.load(model_path)
pre_dict = torch.load(model_path, map_location=device, weights_only=True)
for name, key in model_dict.items():
model_dict[name] = pre_dict[name]
model['CFM'].load_state_dict(model_dict)
@@ -336,7 +343,7 @@ def get_pose3D(path, output_dir, type='image'):
## save
output_dir_pose = output_dir +'pose/'
os.makedirs(output_dir_pose, exist_ok=True)
plt.savefig(output_dir_pose + str(('%04d'% i)) + '_pose.jpg', dpi=200, bbox_inches = 'tight')
plt.savefig(output_dir_pose + str(('%04d'% i)) + '_pose.png', dpi=200, bbox_inches = 'tight')


if __name__ == "__main__":
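The hunks above call `euler_sample` to integrate the flow-matching model from Gaussian noise to a 3D pose, but the loop body is collapsed in this view. A generic Euler-integration sketch of what such a sampler does, with a toy velocity field standing in for the trained CFM network; the time/conditioning handling here is an assumption, not the repo's exact code:

```python
import torch

def euler_sample(condition, y, steps, velocity_fn):
    """Generic Euler integrator for a flow-matching sampler (illustrative only)."""
    dt = 1.0 / steps
    t = torch.zeros(y.shape[0], device=y.device)
    for _ in range(steps):
        v = velocity_fn(y, t, condition)  # predicted velocity at the current point/time
        y = y + dt * v                    # one Euler step along the flow
        t = t + dt
    return y

def toy_velocity(y, t, condition):
    # Stand-in for the trained network: pulls samples toward the origin.
    return -y

noise = torch.randn(2, 17, 3)  # (batch, joints, xyz) starting noise, as in the scripts
cond = torch.zeros(2, 17, 2)   # placeholder for normalized 2D keypoints
sample = euler_sample(cond, noise, steps=3, velocity_fn=toy_velocity)
print(sample.shape)  # torch.Size([2, 17, 3])
```

The script runs this twice, once on the horizontally flipped 2D input, then mirrors the flipped output back (`output_3D_flip[:, :, :, 0] *= -1`) and combines the two results as a test-time augmentation.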
17 changes: 9 additions & 8 deletions demo/vis_in_the_wild.sh
@@ -1,21 +1,22 @@
#Test
layers=5
gpu_id=1
gpu_id=0
sample_steps=3
batch_size=1
sh_file='vis_in_the_wild.sh'

model_path='../pre_trained_models/fmpose_detected2d/model_GAMLP.py'
saved_model_path='../pre_trained_models/fmpose_detected2d/FMpose_36_4972_best.pth'
model_type='fmpose3d'
model_weights_path='../pre_trained_models/fmpose3d_h36m/FMpose3D_pretrained_weights.pth'

# path='./images/image_00068.jpg' # single image
input_images_folder='./images/' # folder containing multiple images
target_path='./images/' # folder containing multiple images
# target_path='./images/xx.png' # single image
# target_path='./videos/xxx.mp4' # video path

python3 vis_in_the_wild.py \
--type 'image' \
--path ${input_images_folder} \
--saved_model_path "${saved_model_path}" \
--model_path "${model_path}" \
--path ${target_path} \
--model_weights_path "${model_weights_path}" \
--model_type "${model_type}" \
--sample_steps ${sample_steps} \
--batch_size ${batch_size} \
--layers ${layers} \
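The demo script now selects the architecture by name (`model_type='fmpose3d'`) instead of pointing at a model source file, matching the `from fmpose3d.models import get_model` call added in vis_in_the_wild.py above. A minimal sketch of an extendable registry of this kind; the decorator name and error handling are assumptions, not the repo's exact implementation:

```python
# Minimal registry sketch; fmpose3d.models.get_model may differ in its details.
_MODEL_REGISTRY = {}

def register_model(name):
    """Class decorator that makes a model class constructible by name."""
    def wrap(cls):
        _MODEL_REGISTRY[name] = cls
        return cls
    return wrap

def get_model(name):
    try:
        return _MODEL_REGISTRY[name]
    except KeyError as err:
        raise KeyError(
            f"Unknown model_type '{name}'. Registered: {sorted(_MODEL_REGISTRY)}"
        ) from err

@register_model("fmpose3d")
class CFM:
    """Stand-in for the real flow-matching model class."""
    def __init__(self, args=None):
        self.args = args

ModelCls = get_model("fmpose3d")  # mirrors CFM = get_model(args.model_type) in the demo
model = ModelCls(args=None)
```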
32 changes: 32 additions & 0 deletions fmpose3d/__init__.py
@@ -18,17 +18,49 @@
aggregation_RPEA_joint_level,
)

# Configuration dataclasses
from .common.config import (
FMPose3DConfig,
HRNetConfig,
InferenceConfig,
ModelConfig,
PipelineConfig,
)

# High-level inference API
from .fmpose3d import (
FMPose3DInference,
HRNetEstimator,
Pose2DResult,
Pose3DResult,
Source,
)

# Import 2D pose detection utilities
from .lib.hrnet.gen_kpts import gen_video_kpts
from .lib.hrnet.hrnet import HRNetPose2d
from .lib.preprocess import h36m_coco_format, revise_kpts

# Make commonly used classes/functions available at package level
__all__ = [
# Inference API
"FMPose3DInference",
"HRNetEstimator",
"Pose2DResult",
"Pose3DResult",
"Source",
# Configuration
"FMPose3DConfig",
"HRNetConfig",
"InferenceConfig",
"ModelConfig",
"PipelineConfig",
# Aggregation methods
"average_aggregation",
"aggregation_select_single_best_hypothesis_by_2D_error",
"aggregation_RPEA_joint_level",
# 2D pose detection
"HRNetPose2d",
"gen_video_kpts",
"h36m_coco_format",
"revise_kpts",
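With these re-exports, the configuration dataclasses and the high-level inference API are importable from the package root. The import below is grounded in the `__all__` list above; constructor signatures are not shown in this diff, so the usage lines are left as hypothetical comments:

```python
# Top-level imports enabled by the new __init__.py exports.
from fmpose3d import FMPose3DInference, HRNetPose2d, PipelineConfig

# Hypothetical usage (signatures not shown in this diff):
# config = PipelineConfig(...)
# pipeline = FMPose3DInference(config)
# poses_3d = pipeline.run("path/to/video.mp4")
```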
4 changes: 0 additions & 4 deletions fmpose3d/aggregation_methods.py
@@ -166,17 +166,13 @@ def aggregation_RPEA_joint_level(
dist[:, :, 0] = 0.0

# Convert 2D losses to weights using softmax over top-k hypotheses per joint
tau = float(getattr(args, "weight_softmax_tau", 1.0))
H = dist.size(1)
k = int(getattr(args, "topk", None))
# print("k:", k)
# k = int(H//2)+1
k = max(1, min(k, H))

# top-k smallest distances along hypothesis dim
topk_vals, topk_idx = torch.topk(dist, k=k, dim=1, largest=False) # (B,k,J)

# Weight calculation method ; weight_method = 'exp'
temp = args.exp_temp
max_safe_val = temp * 20
topk_vals_clipped = torch.clamp(topk_vals, max=max_safe_val)
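The hunk above removes dead code around the per-joint hypothesis weighting: for each joint, take the `k` hypotheses with the smallest 2D error, clamp the values, and convert them into exponential weights. A self-contained sketch of that weighting step; the exact normalization in `aggregation_RPEA_joint_level` may differ from this softmax form:

```python
import torch

def topk_exp_weights(dist, k, temp):
    """Per-joint exponential weighting over the k best hypotheses (illustrative).

    dist: (B, H, J) per-hypothesis, per-joint 2D error.
    Returns weights of shape (B, k, J) and the indices of the selected hypotheses.
    """
    k = max(1, min(k, dist.size(1)))
    topk_vals, topk_idx = torch.topk(dist, k=k, dim=1, largest=False)  # k smallest errors
    topk_vals = torch.clamp(topk_vals, max=temp * 20)                  # clipping, as in the diff
    weights = torch.softmax(-topk_vals / temp, dim=1)                  # smaller error -> larger weight
    return weights, topk_idx

weights, idx = topk_exp_weights(torch.rand(2, 8, 17), k=3, temp=0.5)
print(weights.shape)  # torch.Size([2, 3, 17])
```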
7 changes: 2 additions & 5 deletions fmpose3d/animals/common/arber_dataset.py
@@ -12,7 +12,6 @@
import glob
import os
import random
import sys

import cv2
import matplotlib.pyplot as plt
@@ -23,10 +22,8 @@
from torch.utils.data import Dataset
from tqdm import tqdm

sys.path.append(os.path.dirname(sys.path[0]))

from common.camera import normalize_screen_coordinates
from common.lifter3d import load_camera_params, load_h5_keypoints
from fmpose3d.common.camera import normalize_screen_coordinates
from fmpose3d.animals.common.lifter3d import load_camera_params, load_h5_keypoints


class ArberDataset(Dataset):