AR/VR Hand Tracking + SLAM Mapping in Python with MediaPipe
Build real-time hand tracking with 21-landmark detection and SLAM-based environment mapping using MediaPipe and Open3D. Complete Python implementation for AR/VR spatial interactions.
Hand tracking and spatial mapping are the two foundational capabilities of any serious AR/VR application. This post walks through a complete Python implementation: MediaPipe Hands for 21-point landmark detection, gesture recognition built on top of those landmarks, and a simple SLAM-based environment mapper that combines ORB-based visual odometry with point clouds unprojected from depth images.
All of it runs on CPU (webcam) with optional CUDA acceleration for the SLAM component.
Part 1: MediaPipe Hand Tracking
MediaPipe Hands provides 21 3D landmarks per hand, at 30+ FPS on CPU. Each landmark is an (x, y, z) coordinate: x and y are normalized to the image width and height, while z is a depth estimate relative to the wrist.
Landmark map:
0: WRIST
1-4: THUMB (cmc, mcp, ip, tip)
5-8: INDEX (mcp, pip, dip, tip)
9-12: MIDDLE (mcp, pip, dip, tip)
13-16: RING (mcp, pip, dip, tip)
17-20: PINKY (mcp, pip, dip, tip)
Installation
pip install mediapipe opencv-python numpy open3d
Basic Hand Tracking
import cv2
import mediapipe as mp
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Tuple
@dataclass
class HandLandmarks:
    """One detected hand: the 21 MediaPipe landmarks plus metadata."""

    # (21, 3) array of landmark coordinates in MediaPipe's normalized space.
    landmarks: np.ndarray
    # Side classified by MediaPipe: "Left" or "Right".
    handedness: str
    # Score of the handedness classification.
    confidence: float
# Module-level handles to MediaPipe's hand-tracking solution and its
# landmark-drawing utilities, shared by the tracking functions below.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
def track_hands_from_webcam(
    camera_id: int = 0,
    max_hands: int = 2,
    min_detection_confidence: float = 0.7,
    min_tracking_confidence: float = 0.5,
    on_frame_callback=None  # Called each frame with (list[HandLandmarks], frame)
):
    """
    Real-time hand tracking from webcam until 'q' is pressed.

    Runs MediaPipe Hands on every frame, draws the detected landmark
    skeletons onto the BGR frame, and shows the result in an OpenCV window.

    Args:
        camera_id: OpenCV capture device index.
        max_hands: Maximum number of hands to track per frame.
        min_detection_confidence: Threshold for the palm detector.
        min_tracking_confidence: Threshold for the landmark tracker.
        on_frame_callback: Optional callable invoked each frame as
            ``on_frame_callback(hands, frame)`` where ``hands`` is a
            ``list[HandLandmarks]`` and ``frame`` is the annotated BGR image.

    Raises:
        RuntimeError: If the camera cannot be opened.
    """
    cap = cv2.VideoCapture(camera_id)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open camera {camera_id}")
    try:
        with mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=max_hands,
            model_complexity=1,  # 0=lite, 1=full
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence
        ) as hands:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break
                # MediaPipe expects RGB; marking the buffer read-only lets
                # MediaPipe skip an internal copy.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False
                results = hands.process(rgb_frame)
                rgb_frame.flags.writeable = True
                detected_hands = []
                if results.multi_hand_landmarks:
                    for hand_landmarks, handedness in zip(
                        results.multi_hand_landmarks,
                        results.multi_handedness
                    ):
                        # Landmarks arrive as protobufs; flatten to (21, 3).
                        lm_array = np.array([
                            [lm.x, lm.y, lm.z]
                            for lm in hand_landmarks.landmark
                        ])
                        hand = HandLandmarks(
                            landmarks=lm_array,
                            handedness=handedness.classification[0].label,
                            confidence=handedness.classification[0].score
                        )
                        detected_hands.append(hand)
                        # Draw the 21-point skeleton on the display frame.
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                        )
                if on_frame_callback:
                    on_frame_callback(detected_hands, frame)
                cv2.imshow("Hand Tracking", frame)
                if cv2.waitKey(5) & 0xFF == ord("q"):
                    break
    finally:
        # Release the camera and window even if the callback or MediaPipe
        # raised mid-loop (the original leaked both on any exception).
        cap.release()
        cv2.destroyAllWindows()
Part 2: Gesture Recognition
With 21 landmarks, you can detect complex gestures by analyzing finger extension states and angles:
class GestureRecognizer:
    """Recognize hand gestures from MediaPipe landmarks.

    Operates on the (21, 3) normalized landmark array carried by
    HandLandmarks. The heuristics assume a roughly upright hand facing the
    camera; image-space y grows downward.
    """

    # Landmark indices per finger, ordered [thumb, index, middle, ring, pinky].
    FINGER_TIPS = [4, 8, 12, 16, 20]
    FINGER_PIPS = [3, 6, 10, 14, 18]
    FINGER_MCPS = [2, 5, 9, 13, 17]

    def is_finger_extended(
        self,
        landmarks: np.ndarray,
        finger_tip: int,
        finger_pip: int
    ) -> bool:
        """Check if a finger is extended (tip above PIP in image space).

        Normalized y grows downward, so "above" means a smaller y value.
        NOTE(review): assumes an upright hand — verify for rotated poses.
        """
        return landmarks[finger_tip][1] < landmarks[finger_pip][1]

    def get_finger_states(self, landmarks: np.ndarray) -> List[bool]:
        """Returns [thumb, index, middle, ring, pinky] extension states."""
        states = []
        # The thumb folds sideways rather than curling down, so compare the
        # tip's x to the thumb MCP (landmark 2) instead of the tip/PIP y test.
        thumb_extended = abs(landmarks[4][0] - landmarks[2][0]) > 0.04
        states.append(thumb_extended)
        # Remaining four fingers use the vertical tip-vs-PIP test.
        for tip, pip in zip(self.FINGER_TIPS[1:], self.FINGER_PIPS[1:]):
            states.append(self.is_finger_extended(landmarks, tip, pip))
        return states

    def recognize(self, hand: "HandLandmarks") -> str:
        """Classify gesture from hand landmarks.

        Returns one of: "fist", "open_hand", "gun", "pointing", "peace",
        "rock_on", "thumbs_up", "pinch", "unknown".
        """
        lm = hand.landmarks
        states = self.get_finger_states(lm)
        thumb, index, middle, ring, pinky = states
        # Gesture definitions, most-specific first.
        if not any(states):
            return "fist"
        if all(states):
            return "open_hand"
        # BUGFIX: test "gun" (thumb + index) before the thumb-agnostic
        # "pointing" check — previously "pointing" matched first and made
        # the "gun" branch unreachable.
        if thumb and index and not middle and not ring and not pinky:
            return "gun"
        if index and not middle and not ring and not pinky:
            return "pointing"
        if index and middle and not ring and not pinky:
            return "peace"
        if thumb and not index and not middle and not ring and pinky:
            return "rock_on"
        if thumb and not index and not middle and not ring and not pinky:
            return "thumbs_up"
        # Pinch: thumb tip close to index tip in image space.
        thumb_tip = lm[4]
        index_tip = lm[8]
        pinch_dist = np.linalg.norm(thumb_tip[:2] - index_tip[:2])
        if pinch_dist < 0.05:
            return "pinch"
        return "unknown"

    def get_pinch_position(self, hand: "HandLandmarks") -> Optional[Tuple[float, float]]:
        """Return the normalized (x, y) midpoint between thumb and index tips."""
        lm = hand.landmarks
        thumb_tip = lm[4][:2]
        index_tip = lm[8][:2]
        return tuple((thumb_tip + index_tip) / 2)
# Example usage: annotate every detected hand with its recognized gesture.
recognizer = GestureRecognizer()

def on_frame(hands: List[HandLandmarks], frame: np.ndarray):
    """Per-frame callback: draw a "<handedness>: <gesture>" label at each wrist."""
    for hand in hands:
        gesture = recognizer.recognize(hand)
        # Wrist landmark (index 0) is normalized; scale to pixel coordinates.
        wrist_x = int(hand.landmarks[0][0] * frame.shape[1])
        wrist_y = int(hand.landmarks[0][1] * frame.shape[0])
        label = f"{hand.handedness}: {gesture}"
        cv2.putText(
            frame,
            label,
            (wrist_x - 50, wrist_y + 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.8,
            (0, 255, 0),
            2,
        )

track_hands_from_webcam(on_frame_callback=on_frame)
Part 3: SLAM-Based Environment Mapping
SLAM (Simultaneous Localization and Mapping) builds a 3D map of the environment while tracking camera position. Here's a simple point-cloud-based implementation using ORB feature matching for visual odometry:
import open3d as o3d
import cv2
import numpy as np
from collections import deque
class SimpleSLAM:
    """
    Simple visual SLAM using ORB features + point cloud accumulation.

    Tracks camera pose with two-view visual odometry (ORB matching and
    essential-matrix decomposition) and, when depth frames are supplied,
    accumulates a colored global point cloud. Monocular odometry recovers
    translation only up to scale, so poses drift without depth or loop
    closure. For production use, consider RTAB-Map or ORB-SLAM3.
    """

    def __init__(self, fx: float = 525.0, fy: float = 525.0,
                 cx: float = 319.5, cy: float = 239.5):
        """Initialize the feature pipeline and map state.

        Args:
            fx, fy: Focal lengths in pixels.
            cx, cy: Principal point (defaults are Kinect-like 640x480 values).
        """
        # 3x3 pinhole intrinsics matrix.
        self.K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
        self.orb = cv2.ORB_create(nfeatures=2000)
        # Hamming distance suits ORB's binary descriptors; cross-check keeps
        # only mutual-best matches.
        self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        self.current_pose = np.eye(4)  # 4x4 camera pose transform.
        self.keyframes = deque(maxlen=50)  # Reserved; not yet populated.
        self.global_map = o3d.geometry.PointCloud()
        # Previous frame's data for frame-to-frame matching.
        self.prev_gray = None
        self.prev_kp = None
        self.prev_desc = None

    def process_frame(
        self,
        color_frame: np.ndarray,
        depth_frame: Optional[np.ndarray] = None  # Optional depth for 3D reconstruction
    ) -> dict:
        """
        Process a frame and update pose / map.

        Args:
            color_frame: BGR image.
            depth_frame: Optional depth image aligned to ``color_frame``
                (assumed millimeters — see ``_add_depth_points``).

        Returns:
            dict with "pose" (4x4 ndarray), "matches" (int), "new_points"
            (int), and "map_size" (int, total points in the global map).
        """
        gray = cv2.cvtColor(color_frame, cv2.COLOR_BGR2GRAY)
        kp, desc = self.orb.detectAndCompute(gray, None)
        result = {"pose": self.current_pose.copy(), "new_points": 0, "matches": 0}
        if self.prev_desc is not None and desc is not None:
            try:
                # Match against the previous frame; keep the 200 best.
                matches = self.matcher.match(self.prev_desc, desc)
                matches = sorted(matches, key=lambda m: m.distance)
                good_matches = matches[:min(200, len(matches))]
                result["matches"] = len(good_matches)
                if len(good_matches) >= 8:  # Minimum for essential-matrix estimation.
                    prev_pts = np.float32([self.prev_kp[m.queryIdx].pt for m in good_matches])
                    curr_pts = np.float32([kp[m.trainIdx].pt for m in good_matches])
                    # Estimate Essential matrix with RANSAC and recover pose.
                    E, mask = cv2.findEssentialMat(
                        curr_pts, prev_pts, self.K,
                        method=cv2.RANSAC, prob=0.999, threshold=1.0
                    )
                    if E is not None:
                        _, R, t, mask = cv2.recoverPose(E, curr_pts, prev_pts, self.K)
                        # NOTE(review): t is unit-norm (monocular scale
                        # ambiguity); metric scale needs depth or an IMU.
                        delta = np.eye(4)
                        delta[:3, :3] = R
                        delta[:3, 3] = t.flatten()
                        self.current_pose = self.current_pose @ delta
                        result["pose"] = self.current_pose.copy()
            except cv2.error:
                # Degenerate geometry or too few inliers: keep the previous
                # pose. Narrowed from a bare Exception so real bugs surface.
                pass
        # Add depth-colored points to the map if depth is available.
        if depth_frame is not None:
            self._add_depth_points(color_frame, depth_frame)
        # Always report map size so callers get a consistent result schema.
        result["map_size"] = len(self.global_map.points)
        self.prev_gray = gray
        self.prev_kp = kp
        self.prev_desc = desc
        return result

    def _add_depth_points(self, color_frame: np.ndarray, depth_frame: np.ndarray):
        """Unproject valid depth pixels to world space and add them to the map.

        Assumes ``depth_frame`` is a single-channel image in millimeters with
        0 marking invalid pixels — TODO confirm for your sensor.
        """
        h, w = depth_frame.shape
        valid = depth_frame > 0
        # Unproject pixels to camera-space 3D via the pinhole model.
        u, v = np.meshgrid(np.arange(w), np.arange(h))
        z = depth_frame[valid].astype(float) / 1000.0  # mm to meters
        x = (u[valid] - self.K[0, 2]) * z / self.K[0, 0]
        y = (v[valid] - self.K[1, 2]) * z / self.K[1, 1]
        # Homogeneous coordinates so the 4x4 pose applies in one product.
        pts_3d = np.column_stack([x, y, z, np.ones(len(z))])
        pts_world = (self.current_pose @ pts_3d.T).T[:, :3]
        colors = color_frame[valid].astype(float) / 255.0
        colors = colors[:, ::-1]  # BGR to RGB for Open3D.
        chunk = o3d.geometry.PointCloud()
        chunk.points = o3d.utility.Vector3dVector(pts_world)
        chunk.colors = o3d.utility.Vector3dVector(colors)
        self.global_map += chunk
        # Downsample periodically to bound memory growth.
        if len(self.global_map.points) > 100000:
            self.global_map = self.global_map.voxel_down_sample(voxel_size=0.02)

    def save_map(self, path: str = "slam_map.ply"):
        """Export the built map as a PLY point cloud at ``path``."""
        o3d.io.write_point_cloud(path, self.global_map)
        print(f"Saved map: {path} ({len(self.global_map.points):,} points)")
# Complete AR session with hand tracking + SLAM
def run_ar_session(camera_id: int = 0):
    """Run a combined AR session: SLAM visual odometry + gesture labels.

    Streams frames from camera ``camera_id``, feeding each one to SimpleSLAM
    (color only — no depth, so only pose is tracked) and to MediaPipe Hands
    for gesture recognition. Press 'q' to quit. The SLAM map is saved to
    "session_map.ply" on exit, even if the loop raised.

    Raises:
        RuntimeError: If the camera cannot be opened.
    """
    slam = SimpleSLAM()
    recognizer = GestureRecognizer()
    cap = cv2.VideoCapture(camera_id)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open camera {camera_id}")
    try:
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7) as hands_model:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Visual odometry on the color frame (no depth supplied).
                slam.process_frame(frame)
                # MediaPipe needs RGB input.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                hand_results = hands_model.process(rgb)
                if hand_results.multi_hand_landmarks:
                    for hl, hw in zip(hand_results.multi_hand_landmarks,
                                      hand_results.multi_handedness):
                        lm = np.array([[l.x, l.y, l.z] for l in hl.landmark])
                        hand = HandLandmarks(lm, hw.classification[0].label,
                                             hw.classification[0].score)
                        gesture = recognizer.recognize(hand)
                        # NOTE(review): labels for multiple hands overlap at
                        # a fixed screen position.
                        cv2.putText(frame, f"{gesture}", (50, 50),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("AR Session", frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Save the map and release resources even if the loop raised
        # (the original leaked the camera/window on any exception).
        slam.save_map("session_map.ply")
        cap.release()
        cv2.destroyAllWindows()
The AR/VR Workspace includes this full pipeline — SLAM mapping, depth estimation, gesture recognition, spatial audio, and full AR scene building — 813 lines, 25 methods, ready to integrate with Unity, Unreal, or a custom OpenGL renderer.