MediaPipe Hand Tracking
Using MediaPipe Hands, track 21 hand landmarks at 30+ FPS. Install with:
pip install mediapipe opencv-python numpy open3d
Code to track hands from webcam:
import cv2, mediapipe as mp, numpy as np
from dataclasses import dataclass
@dataclass
class HandLandmarks:
landmarks: np.ndarray # Shape: (21, 3) — x, y, z normalized
handedness: str # "Left" or "Right"
confidence: float
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
def track_hands_from_webcam(
camera_id: int = 0,
max_hands: int = 2,
min_detection_confidence: float = 0.7,
min_tracking_confidence: float = 0.5,
):
cap = cv2.VideoCapture(camera_id)
with mp_hands.Hands(
static_image_mode=False,
max_num_hands=max_hands,
model_complexity=1,
min_detection_confidence=min_detection_confidence,
min_tracking_confidence=min_tracking_confidence
) as hands:
while cap.isOpened():
success, frame = cap.read()
if not success:
break
# MediaPipe expects RGB
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = hands.process(rgb_frame)
detected_hands = []
if results.multi_hand_landmarks:
for hand_landmarks in results.multi_hand_landmarks:
lm_array = np.array([
[lm.x, lm.y, lm.z]
for lm in hand_landmarks.landmark
])
handedness = hands.handedness.classification[0].label
confidence = hands.handedness.classification[0].score
detected_hands.append(HandLandmarks(
landmarks=lm_array,
handedness=handedness,
confidence=confidence
))
# Draw landmarks on frame
mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
cv2.imshow("Hand Tracking", frame)
if cv2.waitKey(5) & 0xFF == ord("q"):
break
cap.release()
cv2.destroyAllWindows()
Gesture Recognition
Use MediaPipe landmarks to recognize gestures. Install necessary tools:
pip install numpy open3d
Code for gesture recognition:
class GestureRecognizer:
FINGER_TIPS = [4, 8, 12, 16, 20]
def is_finger_extended(
self,
landmarks: np.ndarray,
finger_tip: int,
finger_pip: int
) -> bool:
return landmarks[finger_tip][1] < landmarks[finger_pip][1]
def get_finger_states(self, landmarks: np.ndarray) -> List[bool]:
states = []
# Thumb: compare to wrist x-position instead of y
thumb_extended = abs(landmarks[4][0] - landmarks[2][0]) > 0.04
states.append(thumb_extended)
for tip, pip in zip(self.FINGER_TIPS[1:], [3, 6, 10, 14]):
states.append(self.is_finger_extended(landmarks, tip, pip))
return states
def recognize(self, hand: HandLandmarks) -> str:
lm = hand.landmarks
states = self.get_finger_states(lm)
thumb, index, middle, ring, pinky = states
# Gesture definitions
if not any(states):
return "fist"
if all(states):
return "open_hand"
if index and not middle and not ring and not pinky:
return "pointing"
if index and middle and not ring and not pinky:
return "peace"
if thumb and not index and not middle and not ring and pinky:
return "rock_on"
if thumb and index and not middle and not ring and not pinky:
return "gun"
if thumb and not index and not middle and not ring and not pinky:
return "thumbs_up"
# Pinch: thumb tip close to index tip
thumb_tip = lm[4]
index_tip = lm[8]
pinch_dist = np.linalg.norm(thumb_tip[0:2] - index_tip[0:2])
if pinch_dist < 0.05:
return "pinch"
return "unknown"
# Example usage
recognizer = GestureRecognizer()
def on_frame(hands: List[HandLandmarks], frame: np.ndarray):
for hand in hands:
gesture = recognizer.recognize(hand)
# Draw gesture label
wrist_x = int(hand.landmarks[0][0] * frame.shape[1])
wrist_y = int(hand.landmarks[0][1] * frame.shape[0])
cv2.putText(frame, f"{hand.handedness}: {gesture}",
(wrist_x - 50, wrist_y + 30),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
track_hands_from_webcam(on_frame_callback=on_frame)
SLAM-Based Environment Mapping
Simple SLAM using ORB features and point cloud accumulation:
import open3d as o3d
import cv2
import numpy as np
class SimpleSLAM:
def __init__(self, fx: float = 525.0, fy: float = 525.0,
cx: float = 319.5, cy: float = 239.5):
self.K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
self.orb = cv2.ORB_create(nfeatures=2000)
self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
self.current_pose = np.eye(4) # 4x4 transform matrix
self.keyframes = deque(maxlen=50) # Store last 50 keyframes
self.global_map = o3d.geometry.PointCloud()
self.prev_gray = None
self.prev_kp = None
self.prev_desc = None
def process_frame(self, color_frame: np.ndarray, depth_frame=None):
gray = cv2.cvtColor(color_frame, cv2.COLOR_BGR2GRAY)
kp, desc = self.orb.detectAndCompute(gray, None)
result = {"pose": self.current_pose.copy(), "new_points": 0, "matches": 0}
if self.prev_desc is not None and desc is not None:
matches = self.matcher.match(self.prev_desc, desc)
good_matches = sorted(matches, key=lambda x: x.distance)[:min(200, len(matches))]
result["matches"] = len(good_matches)
if len(good_matches) >= 8:
prev_pts = np.float32([self.prev_kp[m.queryIdx].pt for m in good_matches])
curr_pts = np.float32([kp[m.trainIdx].pt for m in good_matches])
E, mask = cv2.findEssentialMat(
curr_pts, prev_pts, self.K,
method=cv2.RANSAC, prob=0.999, threshold=1.0
)
if E is not None:
_, R, t, mask = cv2.recoverPose(E, curr_pts, prev_pts, self.K)
delta = np.eye(4)
delta[:3, :3] = R
delta[:3, 3] = t.flatten()
self.current_pose = self.current_pose @ delta
result["pose"] = self.current_pose.copy()
if depth_frame is not None:
self._add_depth_points(color_frame, depth_frame)
result["map_size"] = len(self.global_map.points)
self.prev_gray = gray
self.prev_kp = kp
self.prev_desc = desc
return result
def _add_depth_points(self, color_frame: np.ndarray, depth_frame: np.ndarray):
h, w = depth_frame.shape
valid = depth_frame > 0
u, v = np.meshgrid(np.arange(w), np.arange(h))
z = depth_frame[valid].astype(float) / 1000.0
x = (u[valid] - self.K[0,2]) * z / self.K[0,0]
y = (v[valid] - self.K[1,2]) * z / self.K[1,1]
pts_3d = np.column_stack([x, y, z, np.ones(len(z))])
pts_world = (self.current_pose @ pts_3d.T).T[:, :3]
colors = color_frame[valid].astype(float) / 255.0
colors = colors[:, ::-1] # BGR to RGB
chunk = o3d.geometry.PointCloud()
chunk.points = o3d.utility.Vector3dVector(pts_world)
chunk.colors = o3d.utility.Vector3dVector(colors)
self.global_map += chunk
if len(self.global_map.points) > 100000:
self.global_map = self.global_map.voxel_down_sample(voxel_size=0.02)
def save_map(self, path: str = "slam_map.ply"):
o3d.io.write_point_cloud(path, self.global_map)
print(f"Saved map: {path} ({len(self.global_map.points):,} points)")
# Complete AR session
def run_ar_session(camera_id: int = 0):
slam = SimpleSLAM()
recognizer = GestureRecognizer()
cap = cv2.VideoCapture(camera_id)
with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7) as hands_model:
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
slam_result = slam.process_frame(frame)
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
hand_results = hands_model.process(rgb)
if hand_results.multi_hand_landmarks:
for hl, hw in zip(hand_results.multi_hand_landmarks, hand_results.multi_handedness):
lm = np.array([[l.x, l.y, l.z] for l in hl.landmark])
hand = HandLandmarks(lm, hw.classification[0].label, hw.classification[0].score)
gesture = recognizer.recognize(hand)
cv2.putText(frame, f"{gesture}", (50, 50),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("AR Session", frame)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
slam.save_map("session_map.ply")
cap.release()
cv2.destroyAllWindows()
# Run AR session
run_ar_session()
The full AR/VR Workspace includes this pipeline, ready to integrate with Unity, Unreal, or a custom OpenGL renderer. Check out my real AI tools at axon.nepa-ai.com.


