AR/VR Hand Tracking + SLAM Mapping in Python with MediaPipe
Build real-time hand tracking with 21-landmark detection and SLAM-based environment mapping using MediaPipe and Open3D. Complete Python implementation for AR/VR spatial interactions.
Hand tracking and spatial mapping are the two foundational capabilities of any serious AR/VR application. This post walks through a complete Python implementation: MediaPipe Hands for 21-point landmark detection, gesture recognition built on top of those landmarks, and a simple SLAM-based environment mapper that combines ORB-based visual odometry with point clouds unprojected from depth images.
All of it runs on CPU (webcam) with optional CUDA acceleration for the SLAM component.
Part 1: MediaPipe Hand Tracking
MediaPipe Hands provides 21 3D landmarks per hand, at 30+ FPS on CPU. Each landmark is an (x, y, z) coordinate: x and y are normalized to the image width and height, while z is a depth estimate relative to the wrist.
Landmark map:
0: WRIST
1-4: THUMB (cmc, mcp, ip, tip)
5-8: INDEX (mcp, pip, dip, tip)
9-12: MIDDLE (mcp, pip, dip, tip)
13-16: RING (mcp, pip, dip, tip)
17-20: PINKY (mcp, pip, dip, tip)
Installation
pip install mediapipe opencv-python numpy open3d
Basic Hand Tracking
import cv2
import mediapipe as mp
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Tuple
@dataclass
class HandLandmarks:
    """One detected hand: the 21 MediaPipe landmarks plus metadata."""

    # (21, 3) array of landmark coordinates in MediaPipe's normalized space.
    landmarks: np.ndarray
    # Side classified by MediaPipe: "Left" or "Right".
    handedness: str
    # Score of the handedness classification.
    confidence: float
# Module-level handles to MediaPipe's hand-tracking solution and its
# landmark-drawing utilities, shared by the tracking functions below.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
def track_hands_from_webcam(
    camera_id: int = 0,
    max_hands: int = 2,
    min_detection_confidence: float = 0.7,
    min_tracking_confidence: float = 0.5,
    on_frame_callback=None  # Called each frame with (list[HandLandmarks], frame)
):
    """
    Real-time hand tracking from webcam until 'q' is pressed.

    Runs MediaPipe Hands on every frame, draws the detected landmark
    skeletons onto the BGR frame, and shows the result in an OpenCV window.

    Args:
        camera_id: OpenCV capture device index.
        max_hands: Maximum number of hands to track per frame.
        min_detection_confidence: Threshold for the palm detector.
        min_tracking_confidence: Threshold for the landmark tracker.
        on_frame_callback: Optional callable invoked each frame as
            ``on_frame_callback(hands, frame)`` where ``hands`` is a
            ``list[HandLandmarks]`` and ``frame`` is the annotated BGR image.

    Raises:
        RuntimeError: If the camera cannot be opened.
    """
    cap = cv2.VideoCapture(camera_id)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open camera {camera_id}")
    try:
        with mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=max_hands,
            model_complexity=1,  # 0=lite, 1=full
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence
        ) as hands:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break
                # MediaPipe expects RGB; marking the buffer read-only lets
                # MediaPipe skip an internal copy.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False
                results = hands.process(rgb_frame)
                rgb_frame.flags.writeable = True
                detected_hands = []
                if results.multi_hand_landmarks:
                    for hand_landmarks, handedness in zip(
                        results.multi_hand_landmarks,
                        results.multi_handedness
                    ):
                        # Landmarks arrive as protobufs; flatten to (21, 3).
                        lm_array = np.array([
                            [lm.x, lm.y, lm.z]
                            for lm in hand_landmarks.landmark
                        ])
                        hand = HandLandmarks(
                            landmarks=lm_array,
                            handedness=handedness.classification[0].label,
                            confidence=handedness.classification[0].score
                        )
                        detected_hands.append(hand)
                        # Draw the 21-point skeleton on the display frame.
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                        )
                if on_frame_callback:
                    on_frame_callback(detected_hands, frame)
                cv2.imshow("Hand Tracking", frame)
                if cv2.waitKey(5) & 0xFF == ord("q"):
                    break
    finally:
        # Release the camera and window even if the callback or MediaPipe
        # raised mid-loop (the original leaked both on any exception).
        cap.release()
        cv2.destroyAllWindows()
Part 2: Gesture Recognition
With 21 landmarks, you can detect complex gestures by analyzing finger extension states and angles:
class GestureRecognizer:
    """Recognize hand gestures from MediaPipe landmarks.

    Operates on the (21, 3) normalized landmark array carried by
    HandLandmarks. The heuristics assume a roughly upright hand facing the
    camera; image-space y grows downward.
    """

    # Landmark indices per finger, ordered [thumb, index, middle, ring, pinky].
    FINGER_TIPS = [4, 8, 12, 16, 20]
    FINGER_PIPS = [3, 6, 10, 14, 18]
    FINGER_MCPS = [2, 5, 9, 13, 17]

    def is_finger_extended(
        self,
        landmarks: np.ndarray,
        finger_tip: int,
        finger_pip: int
    ) -> bool:
        """Check if a finger is extended (tip above PIP in image space).

        Normalized y grows downward, so "above" means a smaller y value.
        NOTE(review): assumes an upright hand — verify for rotated poses.
        """
        return landmarks[finger_tip][1] < landmarks[finger_pip][1]

    def get_finger_states(self, landmarks: np.ndarray) -> List[bool]:
        """Returns [thumb, index, middle, ring, pinky] extension states."""
        states = []
        # The thumb folds sideways rather than curling down, so compare the
        # tip's x to the thumb MCP (landmark 2) instead of the tip/PIP y test.
        thumb_extended = abs(landmarks[4][0] - landmarks[2][0]) > 0.04
        states.append(thumb_extended)
        # Remaining four fingers use the vertical tip-vs-PIP test.
        for tip, pip in zip(self.FINGER_TIPS[1:], self.FINGER_PIPS[1:]):
            states.append(self.is_finger_extended(landmarks, tip, pip))
        return states

    def recognize(self, hand: "HandLandmarks") -> str:
        """Classify gesture from hand landmarks.

        Returns one of: "fist", "open_hand", "gun", "pointing", "peace",
        "rock_on", "thumbs_up", "pinch", "unknown".
        """
        lm = hand.landmarks
        states = self.get_finger_states(lm)
        thumb, index, middle, ring, pinky = states
        # Gesture definitions, most-specific first.
        if not any(states):
            return "fist"
        if all(states):
            return "open_hand"
        # BUGFIX: test "gun" (thumb + index) before the thumb-agnostic
        # "pointing" check — previously "pointing" matched first and made
        # the "gun" branch unreachable.
        if thumb and index and not middle and not ring and not pinky:
            return "gun"
        if index and not middle and not ring and not pinky:
            return "pointing"
        if index and middle and not ring and not pinky:
            return "peace"
        if thumb and not index and not middle and not ring and pinky:
            return "rock_on"
        if thumb and not index and not middle and not ring and not pinky:
            return "thumbs_up"
        # Pinch: thumb tip close to index tip in image space.
        thumb_tip = lm[4]
        index_tip = lm[8]
        pinch_dist = np.linalg.norm(thumb_tip[:2] - index_tip[:2])
        if pinch_dist < 0.05:
            return "pinch"
        return "unknown"

    def get_pinch_position(self, hand: "HandLandmarks") -> Optional[Tuple[float, float]]:
        """Return the normalized (x, y) midpoint between thumb and index tips."""
        lm = hand.landmarks
        thumb_tip = lm[4][:2]
        index_tip = lm[8][:2]
        return tuple((thumb_tip + index_tip) / 2)
# Example usage: annotate every detected hand with its recognized gesture.
recognizer = GestureRecognizer()

def on_frame(hands: List[HandLandmarks], frame: np.ndarray):
    """Per-frame callback: draw a "<handedness>: <gesture>" label at each wrist."""
    for hand in hands:
        gesture = recognizer.recognize(hand)
        # Wrist landmark (index 0) is normalized; scale to pixel coordinates.
        wrist_x = int(hand.landmarks[0][0] * frame.shape[1])
        wrist_y = int(hand.landmarks[0][1] * frame.shape[0])
        label = f"{hand.handedness}: {gesture}"
        cv2.putText(
            frame,
            label,
            (wrist_x - 50, wrist_y + 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.8,
            (0, 255, 0),
            2,
        )

track_hands_from_webcam(on_frame_callback=on_frame)
Part 3: SLAM-Based Environment Mapping
SLAM (Simultaneous Localization and Mapping) builds a 3D map of the environment while tracking camera position. Here's a simple point-cloud-based implementation using ORB feature matching for visual odometry:
import open3d as o3d
import cv2
import numpy as np
from collections import deque
class SimpleSLAM:
    """
    Simple visual SLAM using ORB features + point cloud accumulation.

    Tracks camera pose with two-view visual odometry (ORB matching and
    essential-matrix decomposition) and, when depth frames are supplied,
    accumulates a colored global point cloud. Monocular odometry recovers
    translation only up to scale, so poses drift without depth or loop
    closure. For production use, consider RTAB-Map or ORB-SLAM3.
    """

    def __init__(self, fx: float = 525.0, fy: float = 525.0,
                 cx: float = 319.5, cy: float = 239.5):
        """Initialize the feature pipeline and map state.

        Args:
            fx, fy: Focal lengths in pixels.
            cx, cy: Principal point (defaults are Kinect-like 640x480 values).
        """
        # 3x3 pinhole intrinsics matrix.
        self.K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
        self.orb = cv2.ORB_create(nfeatures=2000)
        # Hamming distance suits ORB's binary descriptors; cross-check keeps
        # only mutual-best matches.
        self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        self.current_pose = np.eye(4)  # 4x4 camera pose transform.
        self.keyframes = deque(maxlen=50)  # Reserved; not yet populated.
        self.global_map = o3d.geometry.PointCloud()
        # Previous frame's data for frame-to-frame matching.
        self.prev_gray = None
        self.prev_kp = None
        self.prev_desc = None

    def process_frame(
        self,
        color_frame: np.ndarray,
        depth_frame: Optional[np.ndarray] = None  # Optional depth for 3D reconstruction
    ) -> dict:
        """
        Process a frame and update pose / map.

        Args:
            color_frame: BGR image.
            depth_frame: Optional depth image aligned to ``color_frame``
                (assumed millimeters — see ``_add_depth_points``).

        Returns:
            dict with "pose" (4x4 ndarray), "matches" (int), "new_points"
            (int), and "map_size" (int, total points in the global map).
        """
        gray = cv2.cvtColor(color_frame, cv2.COLOR_BGR2GRAY)
        kp, desc = self.orb.detectAndCompute(gray, None)
        result = {"pose": self.current_pose.copy(), "new_points": 0, "matches": 0}
        if self.prev_desc is not None and desc is not None:
            try:
                # Match against the previous frame; keep the 200 best.
                matches = self.matcher.match(self.prev_desc, desc)
                matches = sorted(matches, key=lambda m: m.distance)
                good_matches = matches[:min(200, len(matches))]
                result["matches"] = len(good_matches)
                if len(good_matches) >= 8:  # Minimum for essential-matrix estimation.
                    prev_pts = np.float32([self.prev_kp[m.queryIdx].pt for m in good_matches])
                    curr_pts = np.float32([kp[m.trainIdx].pt for m in good_matches])
                    # Estimate Essential matrix with RANSAC and recover pose.
                    E, mask = cv2.findEssentialMat(
                        curr_pts, prev_pts, self.K,
                        method=cv2.RANSAC, prob=0.999, threshold=1.0
                    )
                    if E is not None:
                        _, R, t, mask = cv2.recoverPose(E, curr_pts, prev_pts, self.K)
                        # NOTE(review): t is unit-norm (monocular scale
                        # ambiguity); metric scale needs depth or an IMU.
                        delta = np.eye(4)
                        delta[:3, :3] = R
                        delta[:3, 3] = t.flatten()
                        self.current_pose = self.current_pose @ delta
                        result["pose"] = self.current_pose.copy()
            except cv2.error:
                # Degenerate geometry or too few inliers: keep the previous
                # pose. Narrowed from a bare Exception so real bugs surface.
                pass
        # Add depth-colored points to the map if depth is available.
        if depth_frame is not None:
            self._add_depth_points(color_frame, depth_frame)
        # Always report map size so callers get a consistent result schema.
        result["map_size"] = len(self.global_map.points)
        self.prev_gray = gray
        self.prev_kp = kp
        self.prev_desc = desc
        return result

    def _add_depth_points(self, color_frame: np.ndarray, depth_frame: np.ndarray):
        """Unproject valid depth pixels to world space and add them to the map.

        Assumes ``depth_frame`` is a single-channel image in millimeters with
        0 marking invalid pixels — TODO confirm for your sensor.
        """
        h, w = depth_frame.shape
        valid = depth_frame > 0
        # Unproject pixels to camera-space 3D via the pinhole model.
        u, v = np.meshgrid(np.arange(w), np.arange(h))
        z = depth_frame[valid].astype(float) / 1000.0  # mm to meters
        x = (u[valid] - self.K[0, 2]) * z / self.K[0, 0]
        y = (v[valid] - self.K[1, 2]) * z / self.K[1, 1]
        # Homogeneous coordinates so the 4x4 pose applies in one product.
        pts_3d = np.column_stack([x, y, z, np.ones(len(z))])
        pts_world = (self.current_pose @ pts_3d.T).T[:, :3]
        colors = color_frame[valid].astype(float) / 255.0
        colors = colors[:, ::-1]  # BGR to RGB for Open3D.
        chunk = o3d.geometry.PointCloud()
        chunk.points = o3d.utility.Vector3dVector(pts_world)
        chunk.colors = o3d.utility.Vector3dVector(colors)
        self.global_map += chunk
        # Downsample periodically to bound memory growth.
        if len(self.global_map.points) > 100000:
            self.global_map = self.global_map.voxel_down_sample(voxel_size=0.02)

    def save_map(self, path: str = "slam_map.ply"):
        """Export the built map as a PLY point cloud at ``path``."""
        o3d.io.write_point_cloud(path, self.global_map)
        print(f"Saved map: {path} ({len(self.global_map.points):,} points)")
# Complete AR session with hand tracking + SLAM
def run_ar_session(camera_id: int = 0):
    """Run a combined AR session: SLAM visual odometry + gesture labels.

    Streams frames from camera ``camera_id``, feeding each one to SimpleSLAM
    (color only — no depth, so only pose is tracked) and to MediaPipe Hands
    for gesture recognition. Press 'q' to quit. The SLAM map is saved to
    "session_map.ply" on exit, even if the loop raised.

    Raises:
        RuntimeError: If the camera cannot be opened.
    """
    slam = SimpleSLAM()
    recognizer = GestureRecognizer()
    cap = cv2.VideoCapture(camera_id)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open camera {camera_id}")
    try:
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7) as hands_model:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Visual odometry on the color frame (no depth supplied).
                slam.process_frame(frame)
                # MediaPipe needs RGB input.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                hand_results = hands_model.process(rgb)
                if hand_results.multi_hand_landmarks:
                    for hl, hw in zip(hand_results.multi_hand_landmarks,
                                      hand_results.multi_handedness):
                        lm = np.array([[l.x, l.y, l.z] for l in hl.landmark])
                        hand = HandLandmarks(lm, hw.classification[0].label,
                                             hw.classification[0].score)
                        gesture = recognizer.recognize(hand)
                        # NOTE(review): labels for multiple hands overlap at
                        # a fixed screen position.
                        cv2.putText(frame, f"{gesture}", (50, 50),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("AR Session", frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Save the map and release resources even if the loop raised
        # (the original leaked the camera/window on any exception).
        slam.save_map("session_map.ply")
        cap.release()
        cv2.destroyAllWindows()
The AR/VR Workspace includes this full pipeline — SLAM mapping, depth estimation, gesture recognition, spatial audio, and full AR scene building — 813 lines, 25 methods, ready to integrate with Unity, Unreal, or a custom OpenGL renderer.