Back to Blog
2026-03-22

AR/VR Hand Tracking + SLAM Mapping in Python with MediaPipe

Build real-time hand tracking with 21-landmark detection and SLAM-based environment mapping using MediaPipe and Open3D. Complete Python implementation for AR/VR spatial interactions.

Hand tracking and spatial mapping are the two foundational capabilities of any serious AR/VR application. This post walks through a complete Python implementation: MediaPipe Hands for 21-point landmark detection, gesture recognition built on top of those landmarks, and a simple SLAM-based environment mapper that combines ORB-based visual odometry with depth-image point cloud accumulation.

All of it runs on CPU (webcam) with optional CUDA acceleration for the SLAM component.

Part 1: MediaPipe Hand Tracking

MediaPipe Hands provides 21 3D landmarks per hand at 30+ FPS on CPU. For each landmark, x and y are normalized to the image width and height, while z is an approximate depth relative to the wrist (smaller values are closer to the camera).

Landmark map:
0: WRIST
1-4: THUMB (cmc, mcp, ip, tip)
5-8: INDEX (mcp, pip, dip, tip)
9-12: MIDDLE (mcp, pip, dip, tip)
13-16: RING (mcp, pip, dip, tip)
17-20: PINKY (mcp, pip, dip, tip)

Installation

pip install mediapipe opencv-python numpy open3d

Basic Hand Tracking

import cv2
import mediapipe as mp
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class HandLandmarks:
    """One detected hand: 21 landmark coordinates plus handedness metadata."""
    landmarks: np.ndarray   # Shape: (21, 3) — x, y, z normalized
    handedness: str          # "Left" or "Right"
    confidence: float        # Handedness classification score in [0, 1]

# Module-level MediaPipe handles, shared by the functions below.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

def track_hands_from_webcam(
    camera_id: int = 0,
    max_hands: int = 2,
    min_detection_confidence: float = 0.7,
    min_tracking_confidence: float = 0.5,
    on_frame_callback=None    # Called each frame with list[HandLandmarks]
):
    """
    Real-time hand tracking from webcam.

    Opens capture device `camera_id`, runs MediaPipe Hands on every frame,
    draws the landmarks onto the BGR frame, then calls
    on_frame_callback(hands: list[HandLandmarks], frame: np.ndarray).
    Press 'q' in the preview window to quit.

    Fix over the original: the capture device and OpenCV windows are now
    released in a `finally` block, so they are not leaked if MediaPipe or
    the callback raises mid-loop.
    """
    cap = cv2.VideoCapture(camera_id)

    try:
        with mp_hands.Hands(
            static_image_mode=False,    # video mode: reuse tracking across frames
            max_num_hands=max_hands,
            model_complexity=1,     # 0=lite, 1=full
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence
        ) as hands:

            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                # MediaPipe expects RGB; marking the buffer read-only lets
                # MediaPipe avoid an internal copy.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False
                results = hands.process(rgb_frame)
                rgb_frame.flags.writeable = True

                detected_hands = []

                if results.multi_hand_landmarks:
                    for hand_landmarks, handedness in zip(
                        results.multi_hand_landmarks,
                        results.multi_handedness
                    ):
                        # Extract landmarks as a (21, 3) numpy array
                        lm_array = np.array([
                            [lm.x, lm.y, lm.z]
                            for lm in hand_landmarks.landmark
                        ])

                        hand = HandLandmarks(
                            landmarks=lm_array,
                            handedness=handedness.classification[0].label,
                            confidence=handedness.classification[0].score
                        )
                        detected_hands.append(hand)

                        # Draw landmarks on the BGR display frame
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                        )

                if on_frame_callback:
                    on_frame_callback(detected_hands, frame)

                cv2.imshow("Hand Tracking", frame)
                if cv2.waitKey(5) & 0xFF == ord("q"):
                    break
    finally:
        # Always release the camera and windows, even on an exception.
        cap.release()
        cv2.destroyAllWindows()

Part 2: Gesture Recognition

With 21 landmarks, you can detect complex gestures by analyzing finger extension states and angles:

class GestureRecognizer:
    """Recognize hand gestures from MediaPipe landmarks.

    All methods operate on the (21, 3) normalized landmark array carried
    by HandLandmarks. Thresholds are expressed in normalized image
    coordinates and exposed as class attributes so they can be tuned per
    camera setup without touching the recognition logic.
    """

    # Landmark indices per finger, ordered [thumb, index, middle, ring, pinky]
    FINGER_TIPS = [4, 8, 12, 16, 20]
    FINGER_PIPS = [3, 6, 10, 14, 18]   # thumb has no PIP; index 3 is the thumb IP
    FINGER_MCPS = [2, 5, 9, 13, 17]

    # Tunable thresholds (normalized image coordinates).
    THUMB_EXTENSION_THRESHOLD = 0.04   # min horizontal thumb-tip-to-MCP spread
    PINCH_THRESHOLD = 0.05             # max thumb-tip/index-tip distance

    def is_finger_extended(
        self, 
        landmarks: np.ndarray, 
        finger_tip: int, 
        finger_pip: int
    ) -> bool:
        """Check if a finger is extended (tip above pip in image space).

        Image y grows downward, so smaller y means "higher". Assumes a
        roughly upright hand.
        """
        return landmarks[finger_tip][1] < landmarks[finger_pip][1]

    def get_finger_states(self, landmarks: np.ndarray) -> List[bool]:
        """Returns [thumb, index, middle, ring, pinky] extension states."""
        states = []

        # Thumb: the tip-above-PIP test fails for the thumb, so use the
        # horizontal spread between thumb tip (4) and thumb MCP (2) instead.
        thumb_extended = abs(landmarks[4][0] - landmarks[2][0]) > self.THUMB_EXTENSION_THRESHOLD
        states.append(thumb_extended)

        # Other fingers: tip-above-PIP test.
        for tip, pip in zip(self.FINGER_TIPS[1:], self.FINGER_PIPS[1:]):
            states.append(self.is_finger_extended(landmarks, tip, pip))

        return states

    def recognize(self, hand: "HandLandmarks") -> str:
        """Classify gesture from hand landmarks.

        Returns one of: "fist", "open_hand", "pointing", "peace",
        "rock_on", "gun", "thumbs_up", "pinch", "unknown".
        Rules are evaluated in order; first match wins.

        Fix over the original: "pointing" now requires the thumb to be
        folded. Previously its condition was a superset of "gun"
        (thumb + index), which made the "gun" branch unreachable.
        """
        lm = hand.landmarks
        states = self.get_finger_states(lm)
        thumb, index, middle, ring, pinky = states

        # Gesture definitions (first match wins)
        if not any(states):
            return "fist"

        if all(states):
            return "open_hand"

        if not thumb and index and not middle and not ring and not pinky:
            return "pointing"

        if index and middle and not ring and not pinky:
            return "peace"

        if thumb and not index and not middle and not ring and pinky:
            return "rock_on"

        if thumb and index and not middle and not ring and not pinky:
            return "gun"

        if thumb and not index and not middle and not ring and not pinky:
            return "thumbs_up"

        # Pinch: thumb tip close to index tip (2D distance in the image plane).
        # Only reached when no finger-state rule matched above.
        thumb_tip = lm[4]
        index_tip = lm[8]
        pinch_dist = np.linalg.norm(thumb_tip[:2] - index_tip[:2])
        if pinch_dist < self.PINCH_THRESHOLD:
            return "pinch"

        return "unknown"

    def get_pinch_position(self, hand: "HandLandmarks") -> Tuple[float, float]:
        """Get the normalized screen position midway between thumb and index tips.

        Always returns a position — callers should gate on recognize()
        returning "pinch" first. (The previous Optional annotation was
        misleading: this method never returns None.)
        """
        lm = hand.landmarks
        thumb_tip = lm[4][:2]
        index_tip = lm[8][:2]
        return tuple((thumb_tip + index_tip) / 2)

# Example usage
recognizer = GestureRecognizer()

def on_frame(hands: List[HandLandmarks], frame: np.ndarray):
    """Per-frame callback: label each detected hand with its gesture."""
    frame_h, frame_w = frame.shape[0], frame.shape[1]
    for detected in hands:
        label = f"{detected.handedness}: {recognizer.recognize(detected)}"

        # Anchor the label just below and left of the wrist landmark (index 0).
        wrist = detected.landmarks[0]
        anchor = (int(wrist[0] * frame_w) - 50, int(wrist[1] * frame_h) + 30)
        cv2.putText(frame, label, anchor,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

track_hands_from_webcam(on_frame_callback=on_frame)

Part 3: SLAM-Based Environment Mapping

SLAM (Simultaneous Localization and Mapping) builds a 3D map of the environment while tracking camera position. Here's a simple point-cloud-based implementation using ORB feature matching for visual odometry:

import open3d as o3d
import cv2
import numpy as np
from collections import deque

class SimpleSLAM:
    """
    Simple visual SLAM using ORB features + point cloud accumulation.
    For production use, consider RTAB-Map or ORB-SLAM3.

    Per-frame flow (process_frame):
      1. Detect ORB features and match them against the previous frame.
      2. Estimate the essential matrix, recover the relative camera
         motion, and compose it into self.current_pose.
      3. If a depth frame is supplied, unproject it through the camera
         intrinsics and append the colored points to the global map.

    NOTE(review): monocular pose recovery yields translation only up to
    scale, so without depth the trajectory/map scale is arbitrary —
    confirm this matches the intended use.
    """
    
    def __init__(self, fx: float = 525.0, fy: float = 525.0,
                 cx: float = 319.5, cy: float = 239.5):
        # Camera intrinsics — fx, fy, cx, cy are pixels
        # (default: Kinect-like 640x480 parameters)
        self.K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
        
        self.orb = cv2.ORB_create(nfeatures=2000)
        # Hamming distance suits ORB's binary descriptors; crossCheck keeps
        # only mutually-best matches.
        self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        
        self.current_pose = np.eye(4)      # 4x4 homogeneous camera pose
        # NOTE(review): keyframes is never appended to anywhere in this
        # class — dead state, or populated by code outside this listing?
        self.keyframes = deque(maxlen=50)  # Store last 50 keyframes
        self.global_map = o3d.geometry.PointCloud()
        
        # Previous-frame state used for frame-to-frame matching.
        self.prev_gray = None
        self.prev_kp = None
        self.prev_desc = None
    
    def process_frame(
        self, 
        color_frame: np.ndarray,
        depth_frame: np.ndarray = None   # Optional depth for 3D reconstruction (mm, per _add_depth_points)
    ) -> dict:
        """
        Process a frame. Returns current pose and any new map points.

        Returns a dict with:
          "pose":       4x4 pose after this frame (copy)
          "matches":    number of feature matches used (0 on first frame)
          "new_points": always 0 in this implementation
          "map_size":   total map points — present only when depth_frame is given
        """
        gray = cv2.cvtColor(color_frame, cv2.COLOR_BGR2GRAY)
        kp, desc = self.orb.detectAndCompute(gray, None)
        
        result = {"pose": self.current_pose.copy(), "new_points": 0, "matches": 0}
        
        if self.prev_desc is not None and desc is not None:
            # Match features with previous frame
            try:
                matches = self.matcher.match(self.prev_desc, desc)
                # Keep only the strongest (lowest-distance) matches.
                matches = sorted(matches, key=lambda x: x.distance)
                good_matches = matches[:min(200, len(matches))]
                
                result["matches"] = len(good_matches)
                
                # Essential-matrix estimation needs >= 8 correspondences.
                if len(good_matches) >= 8:
                    prev_pts = np.float32([self.prev_kp[m.queryIdx].pt for m in good_matches])
                    curr_pts = np.float32([kp[m.trainIdx].pt for m in good_matches])
                    
                    # Estimate Essential matrix and recover pose
                    E, mask = cv2.findEssentialMat(
                        curr_pts, prev_pts, self.K,
                        method=cv2.RANSAC, prob=0.999, threshold=1.0
                    )
                    
                    if E is not None:
                        _, R, t, mask = cv2.recoverPose(E, curr_pts, prev_pts, self.K)
                        
                        # Compose the frame-to-frame motion into the pose.
                        # t is unit-norm (monocular scale ambiguity).
                        delta = np.eye(4)
                        delta[:3, :3] = R
                        delta[:3, 3] = t.flatten()
                        self.current_pose = self.current_pose @ delta
                        result["pose"] = self.current_pose.copy()
            
            except Exception as e:
                # Best-effort: keep the previous pose on failure.
                # NOTE(review): broad except also hides real bugs; consider
                # narrowing to cv2.error.
                pass  # Insufficient matches, keep previous pose
            
            # Add depth-colored points to map if depth available.
            # NOTE(review): this is nested under the prev_desc check, so the
            # very first frame's depth is never added — confirm intended.
            if depth_frame is not None:
                self._add_depth_points(color_frame, depth_frame)
                result["map_size"] = len(self.global_map.points)
        
        self.prev_gray = gray
        self.prev_kp = kp
        self.prev_desc = desc
        
        return result
    
    def _add_depth_points(self, color_frame: np.ndarray, depth_frame: np.ndarray):
        """Add colored 3D points from depth frame to global map.

        Assumes depth_frame is single-channel in millimeters (0 = invalid)
        and color_frame is a BGR image of the same height/width.
        """
        h, w = depth_frame.shape
        valid = depth_frame > 0
        
        # Unproject valid pixels to camera-space 3D via the pinhole model.
        u, v = np.meshgrid(np.arange(w), np.arange(h))
        z = depth_frame[valid].astype(float) / 1000.0  # mm to meters
        x = (u[valid] - self.K[0,2]) * z / self.K[0,0]
        y = (v[valid] - self.K[1,2]) * z / self.K[1,1]
        
        # Homogeneous coordinates so the 4x4 pose applies in one product.
        pts_3d = np.column_stack([x, y, z, np.ones(len(z))])
        pts_world = (self.current_pose @ pts_3d.T).T[:, :3]
        
        colors = color_frame[valid].astype(float) / 255.0
        colors = colors[:, ::-1]  # BGR to RGB (Open3D expects RGB in [0, 1])
        
        chunk = o3d.geometry.PointCloud()
        chunk.points = o3d.utility.Vector3dVector(pts_world)
        chunk.colors = o3d.utility.Vector3dVector(colors)
        
        self.global_map += chunk
        
        # Downsample periodically to bound memory (2 cm voxels).
        if len(self.global_map.points) > 100000:
            self.global_map = self.global_map.voxel_down_sample(voxel_size=0.02)
    
    def save_map(self, path: str = "slam_map.ply"):
        """Export the built map as a PLY point cloud to `path`."""
        o3d.io.write_point_cloud(path, self.global_map)
        print(f"Saved map: {path} ({len(self.global_map.points):,} points)")

# Complete AR session with hand tracking + SLAM
def run_ar_session(camera_id: int = 0):
    """Run a combined SLAM + gesture-recognition session from a webcam.

    Every frame is fed to SimpleSLAM for visual odometry/mapping and to
    MediaPipe Hands for gesture recognition; one gesture label is drawn
    per detected hand. Press 'q' to quit. The accumulated map is saved to
    session_map.ply on normal exit.

    Fixes over the original: the camera and OpenCV windows are released
    in a `finally` block (no leak on exception), and gesture labels are
    offset per hand instead of all being drawn at (50, 50) where two
    hands' labels overlapped.
    """
    slam = SimpleSLAM()
    recognizer = GestureRecognizer()
    cap = cv2.VideoCapture(camera_id)

    try:
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7) as hands_model:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Process SLAM (color only — monocular, no depth here)
                slam_result = slam.process_frame(frame)

                # Process hand tracking (MediaPipe expects RGB)
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                hand_results = hands_model.process(rgb)

                if hand_results.multi_hand_landmarks:
                    for i, (hl, hw) in enumerate(zip(hand_results.multi_hand_landmarks,
                                                     hand_results.multi_handedness)):
                        lm = np.array([[l.x, l.y, l.z] for l in hl.landmark])
                        hand = HandLandmarks(lm, hw.classification[0].label, hw.classification[0].score)
                        gesture = recognizer.recognize(hand)

                        # Stack labels vertically so two hands don't overlap
                        cv2.putText(frame, f"{gesture}", (50, 50 + 40 * i),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow("AR Session", frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break

        slam.save_map("session_map.ply")
    finally:
        cap.release()
        cv2.destroyAllWindows()

The AR/VR Workspace includes this full pipeline — SLAM mapping, depth estimation, gesture recognition, spatial audio, and full AR scene building — 813 lines, 25 methods, ready to integrate with Unity, Unreal, or a custom OpenGL renderer.

→ Get AR/VR Workspace on the Shop